linux/net/core/net_namespace.c

489 lines
11 KiB
C
Raw Normal View History

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
/*
* Our network namespace constructor/destructor lists
*/
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
static DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
struct net init_net;
EXPORT_SYMBOL(init_net);
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
/*
* setup_net runs the initializers for the network namespace object.
*/
static __net_init int setup_net(struct net *net)
{
/* Must be called with net_mutex held */
struct pernet_operations *ops;
int error;
struct net_generic *ng;
atomic_set(&net->count, 1);
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
#endif
error = -ENOMEM;
ng = kzalloc(sizeof(struct net_generic) +
INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL);
if (ng == NULL)
goto out;
ng->len = INITIAL_NET_GEN_PTRS;
rcu_assign_pointer(net->gen, ng);
error = 0;
list_for_each_entry(ops, &pernet_list, list) {
if (ops->init) {
error = ops->init(net);
if (error < 0)
goto out_undo;
}
}
out:
return error;
out_undo:
/* Walk through the list backwards calling the exit functions
* for the pernet modules whose init functions did not fail.
*/
list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
if (ops->exit)
ops->exit(net);
}
rcu_barrier();
kfree(ng);
goto out;
}
#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
{
return kmem_cache_zalloc(net_cachep, GFP_KERNEL);
}
static void net_free(struct net *net)
{
if (!net)
return;
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
printk(KERN_EMERG "network namespace not free! Usage: %d\n",
atomic_read(&net->use_count));
return;
}
#endif
kfree(net->gen);
kmem_cache_free(net_cachep, net);
}
struct net *copy_net_ns(unsigned long flags, struct net *old_net)
{
struct net *new_net = NULL;
int err;
get_net(old_net);
if (!(flags & CLONE_NEWNET))
return old_net;
err = -ENOMEM;
new_net = net_alloc();
if (!new_net)
goto out;
mutex_lock(&net_mutex);
err = setup_net(new_net);
if (err)
goto out_unlock;
rtnl_lock();
list_add_tail(&new_net->list, &net_namespace_list);
rtnl_unlock();
out_unlock:
mutex_unlock(&net_mutex);
out:
put_net(old_net);
if (err) {
net_free(new_net);
new_net = ERR_PTR(err);
}
return new_net;
}
static void cleanup_net(struct work_struct *work)
{
struct pernet_operations *ops;
struct net *net;
netns: Don't receive new packets in a dead network namespace. Alexey Dobriyan <adobriyan@gmail.com> writes: > Subject: ICMP sockets destruction vs ICMP packets oops > After icmp_sk_exit() nuked ICMP sockets, we get an interrupt. > icmp_reply() wants ICMP socket. > > Steps to reproduce: > > launch shell in new netns > move real NIC to netns > setup routing > ping -i 0 > exit from shell > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [<ffffffff803fce17>] icmp_sk+0x17/0x30 > PGD 17f3cd067 PUD 17f3ce067 PMD 0 > Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC > CPU 0 > Modules linked in: usblp usbcore > Pid: 0, comm: swapper Not tainted 2.6.26-rc6-netns-ct #4 > RIP: 0010:[<ffffffff803fce17>] [<ffffffff803fce17>] icmp_sk+0x17/0x30 > RSP: 0018:ffffffff8057fc30 EFLAGS: 00010286 > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff81017c7db900 > RDX: 0000000000000034 RSI: ffff81017c7db900 RDI: ffff81017dc41800 > RBP: ffffffff8057fc40 R08: 0000000000000001 R09: 000000000000a815 > R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8057fd28 > R13: ffffffff8057fd00 R14: ffff81017c7db938 R15: ffff81017dc41800 > FS: 0000000000000000(0000) GS:ffffffff80525000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 0000000000000000 CR3: 000000017fcda000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 0, threadinfo ffffffff8053a000, task ffffffff804fa4a0) > Stack: 0000000000000000 ffff81017c7db900 ffffffff8057fcf0 ffffffff803fcfe4 > ffffffff804faa38 0000000000000246 0000000000005a40 0000000000000246 > 000000000001ffff ffff81017dd68dc0 0000000000005a40 0000000055342436 > Call Trace: > <IRQ> [<ffffffff803fcfe4>] icmp_reply+0x44/0x1e0 > [<ffffffff803d3a0a>] ? ip_route_input+0x23a/0x1360 > [<ffffffff803fd645>] icmp_echo+0x65/0x70 > [<ffffffff803fd300>] icmp_rcv+0x180/0x1b0 > [<ffffffff803d6d84>] ip_local_deliver+0xf4/0x1f0 > [<ffffffff803d71bb>] ip_rcv+0x33b/0x650 > [<ffffffff803bb16a>] netif_receive_skb+0x27a/0x340 > [<ffffffff803be57d>] process_backlog+0x9d/0x100 > [<ffffffff803bdd4d>] net_rx_action+0x18d/0x250 > [<ffffffff80237be5>] __do_softirq+0x75/0x100 > [<ffffffff8020c97c>] call_softirq+0x1c/0x30 > [<ffffffff8020f085>] do_softirq+0x65/0xa0 > [<ffffffff80237af7>] irq_exit+0x97/0xa0 > [<ffffffff8020f198>] do_IRQ+0xa8/0x130 > [<ffffffff80212ee0>] ? mwait_idle+0x0/0x60 > [<ffffffff8020bc46>] ret_from_intr+0x0/0xf > <EOI> [<ffffffff80212f2c>] ? mwait_idle+0x4c/0x60 > [<ffffffff80212f23>] ? mwait_idle+0x43/0x60 > [<ffffffff8020a217>] ? cpu_idle+0x57/0xa0 > [<ffffffff8040f380>] ? rest_init+0x70/0x80 > Code: 10 5b 41 5c 41 5d 41 5e c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 > 48 83 ec 08 48 8b 9f 78 01 00 00 e8 2b c7 f1 ff 89 c0 <48> 8b 04 c3 48 83 c4 08 > 5b c9 c3 66 66 66 66 66 2e 0f 1f 84 00 > RIP [<ffffffff803fce17>] icmp_sk+0x17/0x30 > RSP <ffffffff8057fc30> > CR2: 0000000000000000 > ---[ end trace ea161157b76b33e8 ]--- > Kernel panic - not syncing: Aiee, killing interrupt handler! Receiving packets while we are cleaning up a network namespace is a racy proposition. It is possible when the packet arrives that we have removed some but not all of the state we need to fully process it. We have the choice of either playing wack-a-mole with the cleanup routines or simply dropping packets when we don't have a network namespace to handle them. Since the check looks inexpensive in netif_receive_skb let's just drop the incoming packets. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-21 13:16:51 +08:00
/* Be very certain incoming network packets will not find us */
rcu_barrier();
net = container_of(work, struct net, work);
mutex_lock(&net_mutex);
/* Don't let anyone else find us. */
rtnl_lock();
list_del(&net->list);
rtnl_unlock();
/* Run all of the network namespace exit methods */
list_for_each_entry_reverse(ops, &pernet_list, list) {
if (ops->exit)
ops->exit(net);
}
mutex_unlock(&net_mutex);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
*/
rcu_barrier();
/* Finally it is safe to free my network namespace structure */
net_free(net);
}
void __put_net(struct net *net)
{
/* Cleanup the network namespace in process context */
INIT_WORK(&net->work, cleanup_net);
queue_work(netns_wq, &net->work);
}
EXPORT_SYMBOL_GPL(__put_net);
#else
struct net *copy_net_ns(unsigned long flags, struct net *old_net)
{
if (flags & CLONE_NEWNET)
return ERR_PTR(-EINVAL);
return old_net;
}
#endif
static int __init net_ns_init(void)
{
int err;
printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC, NULL);
/* Create workqueue for cleanup */
netns_wq = create_singlethread_workqueue("netns");
if (!netns_wq)
panic("Could not create netns workq");
#endif
mutex_lock(&net_mutex);
err = setup_net(&init_net);
rtnl_lock();
list_add_tail(&init_net.list, &net_namespace_list);
rtnl_unlock();
mutex_unlock(&net_mutex);
if (err)
panic("Could not setup the initial network namespace");
return 0;
}
pure_initcall(net_ns_init);
#ifdef CONFIG_NET_NS
static int register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
struct net *net, *undo_net;
int error;
list_add_tail(&ops->list, list);
if (ops->init) {
for_each_net(net) {
error = ops->init(net);
if (error)
goto out_undo;
}
}
return 0;
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
if (ops->exit) {
for_each_net(undo_net) {
if (undo_net == net)
goto undone;
ops->exit(undo_net);
}
}
undone:
return error;
}
static void unregister_pernet_operations(struct pernet_operations *ops)
{
struct net *net;
list_del(&ops->list);
if (ops->exit)
for_each_net(net)
ops->exit(net);
}
#else
static int register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
if (ops->init == NULL)
return 0;
return ops->init(&init_net);
}
static void unregister_pernet_operations(struct pernet_operations *ops)
{
if (ops->exit)
ops->exit(&init_net);
}
#endif
static DEFINE_IDA(net_generic_ids);
/**
* register_pernet_subsys - register a network namespace subsystem
* @ops: pernet operations structure for the subsystem
*
* Register a subsystem which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
* When registered all network namespace init functions are
* called for every existing network namespace. Allowing kernel
* modules to have a race free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
mutex_lock(&net_mutex);
error = register_pernet_operations(first_device, ops);
mutex_unlock(&net_mutex);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
/**
* unregister_pernet_subsys - unregister a network namespace subsystem
* @ops: pernet operations structure to manipulate
*
* Remove the pernet operations structure from the list to be
* used when network namespaces are created or destroyed. In
* addition run the exit method for all existing network
* namespaces.
*/
void unregister_pernet_subsys(struct pernet_operations *module)
{
mutex_lock(&net_mutex);
unregister_pernet_operations(module);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
int register_pernet_gen_subsys(int *id, struct pernet_operations *ops)
{
int rv;
mutex_lock(&net_mutex);
again:
rv = ida_get_new_above(&net_generic_ids, 1, id);
if (rv < 0) {
if (rv == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
goto again;
}
goto out;
}
rv = register_pernet_operations(first_device, ops);
if (rv < 0)
ida_remove(&net_generic_ids, *id);
mutex_unlock(&net_mutex);
out:
return rv;
}
EXPORT_SYMBOL_GPL(register_pernet_gen_subsys);
void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops)
{
mutex_lock(&net_mutex);
unregister_pernet_operations(ops);
ida_remove(&net_generic_ids, id);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys);
/**
* register_pernet_device - register a network namespace device
* @ops: pernet operations structure for the subsystem
*
* Register a device which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
* When registered all network namespace init functions are
* called for every existing network namespace. Allowing kernel
* modules to have a race free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
int register_pernet_device(struct pernet_operations *ops)
{
int error;
mutex_lock(&net_mutex);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
mutex_unlock(&net_mutex);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
int register_pernet_gen_device(int *id, struct pernet_operations *ops)
{
int error;
mutex_lock(&net_mutex);
again:
error = ida_get_new_above(&net_generic_ids, 1, id);
if (error) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
goto again;
}
goto out;
}
error = register_pernet_operations(&pernet_list, ops);
if (error)
ida_remove(&net_generic_ids, *id);
else if (first_device == &pernet_list)
first_device = &ops->list;
out:
mutex_unlock(&net_mutex);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_gen_device);
/**
* unregister_pernet_device - unregister a network namespace netdevice
* @ops: pernet operations structure to manipulate
*
* Remove the pernet operations structure from the list to be
* used when network namespaces are created or destroyed. In
* addition run the exit method for all existing network
* namespaces.
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
mutex_lock(&net_mutex);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
void unregister_pernet_gen_device(int id, struct pernet_operations *ops)
{
mutex_lock(&net_mutex);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
ida_remove(&net_generic_ids, id);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_gen_device);
static void net_generic_release(struct rcu_head *rcu)
{
struct net_generic *ng;
ng = container_of(rcu, struct net_generic, rcu);
kfree(ng);
}
int net_assign_generic(struct net *net, int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
BUG_ON(id == 0);
ng = old_ng = net->gen;
if (old_ng->len >= id)
goto assign;
ng = kzalloc(sizeof(struct net_generic) +
id * sizeof(void *), GFP_KERNEL);
if (ng == NULL)
return -ENOMEM;
/*
* Some synchronisation notes:
*
* The net_generic explores the net->gen array inside rcu
* read section. Besides once set the net->gen->ptr[x]
* pointer never changes (see rules in netns/generic.h).
*
* That said, we simply duplicate this array and schedule
* the old copy for kfree after a grace period.
*/
ng->len = id;
memcpy(&ng->ptr, &old_ng->ptr, old_ng->len);
rcu_assign_pointer(net->gen, ng);
call_rcu(&old_ng->rcu, net_generic_release);
assign:
ng->ptr[id - 1] = data;
return 0;
}
EXPORT_SYMBOL_GPL(net_assign_generic);