Linux Kernel 3.7.1
pid_namespace.c
/*
 * Pid namespaces
 *
 * Authors:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *    Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/reboot.h>
#include <linux/export.h>

#define BITS_PER_PAGE		(PAGE_SIZE*8)

struct pid_cache {
        int nr_ids;
        char name[16];
        struct kmem_cache *cachep;
        struct list_head list;
};

static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;

/*
 * creates the kmem cache to allocate pids from.
 * @nr_ids: the number of numerical ids this pid will have to carry
 */
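/*
 * Each object in the cache holds a struct pid plus one struct upid per
 * namespace level.  Since struct pid already embeds one upid (its
 * numbers[] array has a single element), the object size passed to
 * kmem_cache_create() is
 * sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid).
 */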

static struct kmem_cache *create_pid_cachep(int nr_ids)
{
        struct pid_cache *pcache;
        struct kmem_cache *cachep;

        mutex_lock(&pid_caches_mutex);
        list_for_each_entry(pcache, &pid_caches_lh, list)
                if (pcache->nr_ids == nr_ids)
                        goto out;

        pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
        if (pcache == NULL)
                goto err_alloc;

        snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
        cachep = kmem_cache_create(pcache->name,
                        sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
                        0, SLAB_HWCACHE_ALIGN, NULL);
        if (cachep == NULL)
                goto err_cachep;

        pcache->nr_ids = nr_ids;
        pcache->cachep = cachep;
        list_add(&pcache->list, &pid_caches_lh);
out:
        mutex_unlock(&pid_caches_mutex);
        return pcache->cachep;

err_cachep:
        kfree(pcache);
err_alloc:
        mutex_unlock(&pid_caches_mutex);
        return NULL;
}

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

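/*
 * Allocate a new pid namespace one level below @parent_pid_ns.  The first
 * pidmap page is allocated up front and bit 0 (pid 0) is reserved, a pid
 * cache able to carry level + 1 numerical ids is set up, and the /proc
 * mount for the namespace is prepared.  Returns the new namespace or an
 * ERR_PTR() on failure.
 */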
static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
        struct pid_namespace *ns;
        unsigned int level = parent_pid_ns->level + 1;
        int i;
        int err;

        if (level > MAX_PID_NS_LEVEL) {
                err = -EINVAL;
                goto out;
        }

        err = -ENOMEM;
        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
        if (ns == NULL)
                goto out;

        ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
        if (!ns->pidmap[0].page)
                goto out_free;

        ns->pid_cachep = create_pid_cachep(level + 1);
        if (ns->pid_cachep == NULL)
                goto out_free_map;

        kref_init(&ns->kref);
        ns->level = level;
        ns->parent = get_pid_ns(parent_pid_ns);

        set_bit(0, ns->pidmap[0].page);
        atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);

        for (i = 1; i < PIDMAP_ENTRIES; i++)
                atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

        err = pid_ns_prepare_proc(ns);
        if (err)
                goto out_put_parent_pid_ns;

        return ns;

out_put_parent_pid_ns:
        put_pid_ns(parent_pid_ns);
out_free_map:
        kfree(ns->pidmap[0].page);
out_free:
        kmem_cache_free(pid_ns_cachep, ns);
out:
        return ERR_PTR(err);
}

static void destroy_pid_namespace(struct pid_namespace *ns)
{
        int i;

        for (i = 0; i < PIDMAP_ENTRIES; i++)
                kfree(ns->pidmap[i].page);
        kmem_cache_free(pid_ns_cachep, ns);
}

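/*
 * Called when a new task's namespace proxy is set up.  Without CLONE_NEWPID
 * the caller simply gets a reference to the existing namespace.  CLONE_NEWPID
 * combined with CLONE_THREAD or CLONE_PARENT is rejected: the first task in
 * a new pid namespace must be a proper child that can act as its init.
 */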
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
        if (!(flags & CLONE_NEWPID))
                return get_pid_ns(old_ns);
        if (flags & (CLONE_THREAD|CLONE_PARENT))
                return ERR_PTR(-EINVAL);
        return create_pid_namespace(old_ns);
}

static void free_pid_ns(struct kref *kref)
{
        struct pid_namespace *ns;

        ns = container_of(kref, struct pid_namespace, kref);
        destroy_pid_namespace(ns);
}

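/*
 * Drop a reference to @ns.  When the count hits zero the namespace is
 * destroyed and the walk continues with the parent, so a long chain of
 * dying namespaces is torn down iteratively rather than by recursion
 * through kref_put() release callbacks.
 */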
void put_pid_ns(struct pid_namespace *ns)
{
        struct pid_namespace *parent;

        while (ns != &init_pid_ns) {
                parent = ns->parent;
                if (!kref_put(&ns->kref, free_pid_ns))
                        break;
                ns = parent;
        }
}
EXPORT_SYMBOL_GPL(put_pid_ns);

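/*
 * Called when the last thread of a namespace's init exits.  SIGCHLD is set
 * to SIG_IGN so dead children autoreap, every remaining pid in the namespace
 * is sent SIGKILL, and the function then waits until all children of the
 * init task are gone before returning to the regular exit path.
 */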
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
        int nr;
        int rc;
        struct task_struct *task, *me = current;

        /* Ignore SIGCHLD causing any terminated children to autoreap */
        spin_lock_irq(&me->sighand->siglock);
        me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
        spin_unlock_irq(&me->sighand->siglock);

        /*
         * The last thread in the cgroup-init thread group is terminating.
         * Find remaining pid_ts in the namespace, signal and wait for them
         * to exit.
         *
         * Note: this signals each thread in the namespace - even those that
         *       belong to the same thread group.  To avoid that, we would
         *       have to walk the entire tasklist looking for processes in
         *       this namespace, but that could be unnecessarily expensive if
         *       the pid namespace has just a few processes.  Or we would need
         *       to maintain a tasklist for each pid namespace.
         */
        read_lock(&tasklist_lock);
        nr = next_pidmap(pid_ns, 1);
        while (nr > 0) {
                rcu_read_lock();

                task = pid_task(find_vpid(nr), PIDTYPE_PID);
                if (task && !__fatal_signal_pending(task))
                        send_sig_info(SIGKILL, SEND_SIG_FORCED, task);

                rcu_read_unlock();

                nr = next_pidmap(pid_ns, nr);
        }
        read_unlock(&tasklist_lock);

        /* First reap the EXIT_ZOMBIE children we may have. */
        do {
                clear_thread_flag(TIF_SIGPENDING);
                rc = sys_wait4(-1, NULL, __WALL, NULL);
        } while (rc != -ECHILD);

        /*
         * sys_wait4() above can't reap the TASK_DEAD children.
         * Make sure they all go away, see __unhash_process().
         */
        for (;;) {
                bool need_wait = false;

                read_lock(&tasklist_lock);
                if (!list_empty(&current->children)) {
                        __set_current_state(TASK_UNINTERRUPTIBLE);
                        need_wait = true;
                }
                read_unlock(&tasklist_lock);

                if (!need_wait)
                        break;
                schedule();
        }

        if (pid_ns->reboot)
                current->signal->group_exit_code = pid_ns->reboot;

        acct_exit_ns(pid_ns);
        return;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
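/*
 * The kernel.ns_last_pid sysctl exposes the namespace's last allocated pid.
 * Writing N (requires CAP_SYS_ADMIN) makes the next fork in this namespace
 * try pid N + 1, which checkpoint/restore tools use to recreate tasks with
 * their original pids.
 */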
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table tmp = *table;

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        /*
         * Writing directly to the namespace's last_pid field is OK, since
         * this field is volatile in a living namespace anyway and code
         * writing to it should synchronize its usage by external means.
         */

        tmp.data = &current->nsproxy->pid_ns->last_pid;
        return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}

extern int pid_max;
static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
        {
                .procname = "ns_last_pid",
                .maxlen = sizeof(int),
                .mode = 0666, /* permissions are checked in the handler */
                .proc_handler = pid_ns_ctl_handler,
                .extra1 = &zero,
                .extra2 = &pid_max,
        },
        { }
};
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif	/* CONFIG_CHECKPOINT_RESTORE */

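/*
 * reboot(2) issued from inside a pid namespace must not reboot the host.
 * The requested action is recorded as a signal number in pid_ns->reboot
 * (SIGHUP for restart, SIGINT for halt/power off), which later becomes the
 * group exit code of the namespace init in zap_pid_ns_processes(); the init
 * task itself is killed with SIGKILL and the caller exits.
 */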
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        if (pid_ns == &init_pid_ns)
                return 0;

        switch (cmd) {
        case LINUX_REBOOT_CMD_RESTART2:
        case LINUX_REBOOT_CMD_RESTART:
                pid_ns->reboot = SIGHUP;
                break;

        case LINUX_REBOOT_CMD_POWER_OFF:
        case LINUX_REBOOT_CMD_HALT:
                pid_ns->reboot = SIGINT;
                break;
        default:
                return -EINVAL;
        }

        read_lock(&tasklist_lock);
        force_sig(SIGKILL, pid_ns->child_reaper);
        read_unlock(&tasklist_lock);

        do_exit(0);

        /* Not reached */
        return 0;
}

static __init int pid_namespaces_init(void)
{
        pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);

#ifdef CONFIG_CHECKPOINT_RESTORE
        register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
        return 0;
}

__initcall(pid_namespaces_init);