Linux Kernel  3.7.1
cgroup.c
1 /*
2  * Generic process-grouping system.
3  *
4  * Based originally on the cpuset system, extracted by Paul Menage
5  * Copyright (C) 2006 Google, Inc
6  *
7  * Notifications support
8  * Copyright (C) 2009 Nokia Corporation
9  * Author: Kirill A. Shutemov
10  *
11  * Copyright notices from the original cpuset code:
12  * --------------------------------------------------
13  * Copyright (C) 2003 BULL SA.
14  * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15  *
16  * Portions derived from Patrick Mochel's sysfs code.
17  * sysfs is Copyright (c) 2001-3 Patrick Mochel
18  *
19  * 2003-10-10 Written by Simon Derr.
20  * 2003-10-22 Updates by Stephen Hemminger.
21  * 2004 May-July Rework by Paul Jackson.
22  * ---------------------------------------------------
23  *
24  * This file is subject to the terms and conditions of the GNU General Public
25  * License. See the file COPYING in the main directory of the Linux
26  * distribution for more details.
27  */
28 
29 #include <linux/cgroup.h>
30 #include <linux/cred.h>
31 #include <linux/ctype.h>
32 #include <linux/errno.h>
33 #include <linux/fs.h>
34 #include <linux/init_task.h>
35 #include <linux/kernel.h>
36 #include <linux/list.h>
37 #include <linux/mm.h>
38 #include <linux/mutex.h>
39 #include <linux/mount.h>
40 #include <linux/pagemap.h>
41 #include <linux/proc_fs.h>
42 #include <linux/rcupdate.h>
43 #include <linux/sched.h>
44 #include <linux/backing-dev.h>
45 #include <linux/seq_file.h>
46 #include <linux/slab.h>
47 #include <linux/magic.h>
48 #include <linux/spinlock.h>
49 #include <linux/string.h>
50 #include <linux/sort.h>
51 #include <linux/kmod.h>
52 #include <linux/module.h>
53 #include <linux/delayacct.h>
54 #include <linux/cgroupstats.h>
55 #include <linux/hash.h>
56 #include <linux/namei.h>
57 #include <linux/pid_namespace.h>
58 #include <linux/idr.h>
59 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60 #include <linux/eventfd.h>
61 #include <linux/poll.h>
62 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
63 #include <linux/kthread.h>
64 
65 #include <linux/atomic.h>
66 
67 /* css deactivation bias, makes css->refcnt negative to deny new trygets */
68 #define CSS_DEACT_BIAS INT_MIN
69 
70 /*
71  * cgroup_mutex is the master lock. Any modification to cgroup or its
72  * hierarchy must be performed while holding it.
73  *
74  * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
75  * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
76  * release_agent_path and so on. Modifying requires both cgroup_mutex and
77  * cgroup_root_mutex. Readers can acquire either of the two. This is to
78  * break the following locking order cycle.
79  *
80  * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
81  * B. namespace_sem -> cgroup_mutex
82  *
83  * B happens only through cgroup_show_options() and using cgroup_root_mutex
84  * breaks it.
85  */
86 static DEFINE_MUTEX(cgroup_mutex);
87 static DEFINE_MUTEX(cgroup_root_mutex);
88 
89 /*
90  * Generate an array of cgroup subsystem pointers. At boot time, this is
91  * populated with the built in subsystems, and modular subsystems are
92  * registered after that. The mutable section of this array is protected by
93  * cgroup_mutex.
94  */
95 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
96 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
97 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98 #include <linux/cgroup_subsys.h>
99 };
100 
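/*
 * Illustrative sketch -- not part of cgroup.c.  The subsys[] array above is
 * generated with the "x-macro" pattern: cgroup_subsys.h holds one
 * SUBSYS(name) line per controller, and whatever SUBSYS() is #defined to
 * before the #include decides what each line expands to.  A self-contained
 * userspace version of the same trick (the controller names below are
 * hypothetical, not taken from cgroup_subsys.h):
 */
#if 0
#include <stdio.h>

#define DEMO_SUBSYS_LIST \
	SUBSYS(cpu)      \
	SUBSYS(memory)   \
	SUBSYS(blkio)

/* First expansion: an enum of subsystem IDs. */
#define SUBSYS(_x) _x ## _demo_id,
enum { DEMO_SUBSYS_LIST DEMO_SUBSYS_COUNT };
#undef SUBSYS

/* Second expansion: a table indexed by those IDs. */
#define SUBSYS(_x) [_x ## _demo_id] = #_x,
static const char *demo_subsys_name[] = { DEMO_SUBSYS_LIST };
#undef SUBSYS

int main(void)
{
	int i;

	for (i = 0; i < DEMO_SUBSYS_COUNT; i++)
		printf("%d: %s\n", i, demo_subsys_name[i]);
	return 0;
}
#endif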
101 #define MAX_CGROUP_ROOT_NAMELEN 64
102 
103 /*
104  * A cgroupfs_root represents the root of a cgroup hierarchy,
105  * and may be associated with a superblock to form an active
106  * hierarchy
107  */
108 struct cgroupfs_root {
109  struct super_block *sb;
110 
111  /*
112  * The bitmask of subsystems intended to be attached to this
113  * hierarchy
114  */
115  unsigned long subsys_mask;
116 
117  /* Unique id for this hierarchy. */
118  int hierarchy_id;
119 
120  /* The bitmask of subsystems currently attached to this hierarchy */
121  unsigned long actual_subsys_mask;
122 
123  /* A list running through the attached subsystems */
124  struct list_head subsys_list;
125 
126  /* The root cgroup for this hierarchy */
127  struct cgroup top_cgroup;
128 
129  /* Tracks how many cgroups are currently defined in hierarchy.*/
130  int number_of_cgroups;
131 
132  /* A list running through the active hierarchies */
133  struct list_head root_list;
134 
135  /* All cgroups on this root, cgroup_mutex protected */
136  struct list_head allcg_list;
137 
138  /* Hierarchy-specific flags */
139  unsigned long flags;
140 
141  /* The path to use for release notifications. */
142  char release_agent_path[PATH_MAX];
143 
144  /* The name for this hierarchy - may be empty */
145  char name[MAX_CGROUP_ROOT_NAMELEN];
146 };
147 
148 /*
149  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
150  * subsystems that are otherwise unattached - it never has more than a
151  * single cgroup, and all tasks are part of that cgroup.
152  */
153 static struct cgroupfs_root rootnode;
154 
155 /*
156  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
157  */
158 struct cfent {
159  struct list_head node;
160  struct dentry *dentry;
161  struct cftype *type;
162 };
163 
164 /*
165  * CSS ID -- ID per subsys's Cgroup Subsys State (CSS). Used only when
166  * cgroup_subsys->use_id != 0.
167  */
168 #define CSS_ID_MAX (65535)
169 struct css_id {
170  /*
171  * The css to which this ID points. This pointer is set to valid value
172  * after cgroup is populated. If cgroup is removed, this will be NULL.
173  * This pointer is expected to be RCU-safe because destroy()
174  * is called after synchronize_rcu(). But for safe use, css_is_removed()
175  * and css_tryget() should be used to avoid races.
176  */
177  struct cgroup_subsys_state __rcu *css;
178  /*
179  * ID of this css.
180  */
181  unsigned short id;
182  /*
183  * Depth in hierarchy which this ID belongs to.
184  */
185  unsigned short depth;
186  /*
187  * ID is freed by RCU. (and lookup routine is RCU safe.)
188  */
189  struct rcu_head rcu_head;
190  /*
191  * Hierarchy of CSS ID belongs to.
192  */
193  unsigned short stack[0]; /* Array of Length (depth+1) */
194 };
195 
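/*
 * Illustrative sketch -- not part of cgroup.c.  Because stack[] records the
 * ID of every ancestor up to this entry's depth, an "is @root an ancestor
 * of @child" query needs only a single array lookup, along these lines:
 */
#if 0
static bool css_id_has_ancestor(struct css_id *child, struct css_id *root)
{
	return child->depth >= root->depth &&
	       child->stack[root->depth] == root->id;
}
#endif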
196 /*
197  * cgroup_event represents events which userspace wants to receive.
198  */
199 struct cgroup_event {
200  /*
201  * Cgroup which the event belongs to.
202  */
203  struct cgroup *cgrp;
204  /*
205  * Control file with which the event is associated.
206  */
207  struct cftype *cft;
208  /*
209  * eventfd to signal userspace about the event.
210  */
211  struct eventfd_ctx *eventfd;
212  /*
213  * Each of these stored in a list by the cgroup.
214  */
215  struct list_head list;
216  /*
217  * All fields below needed to unregister event when
218  * userspace closes eventfd.
219  */
220  poll_table pt;
221  wait_queue_head_t *wqh;
222  wait_queue_t wait;
223  struct work_struct remove;
224 };
225 
226 /* The list of hierarchy roots */
227 
228 static LIST_HEAD(roots);
229 static int root_count;
230 
231 static DEFINE_IDA(hierarchy_ida);
232 static int next_hierarchy_id;
233 static DEFINE_SPINLOCK(hierarchy_id_lock);
234 
235 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
236 #define dummytop (&rootnode.top_cgroup)
237 
238 /* This flag indicates whether tasks in the fork and exit paths should
239  * check for fork/exit handlers to call. This avoids us having to do
240  * extra work in the fork/exit path if none of the subsystems need to
241  * be called.
242  */
243 static int need_forkexit_callback __read_mostly;
244 
245 #ifdef CONFIG_PROVE_LOCKING
246 int cgroup_lock_is_held(void)
247 {
248  return lockdep_is_held(&cgroup_mutex);
249 }
250 #else /* #ifdef CONFIG_PROVE_LOCKING */
251 int cgroup_lock_is_held(void)
252 {
253  return mutex_is_locked(&cgroup_mutex);
254 }
255 #endif /* #else #ifdef CONFIG_PROVE_LOCKING */
256 
256 
257 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
258 
259 static int css_unbias_refcnt(int refcnt)
260 {
261  return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
262 }
263 
264 /* the current nr of refs, always >= 0 whether @css is deactivated or not */
265 static int css_refcnt(struct cgroup_subsys_state *css)
266 {
267  int v = atomic_read(&css->refcnt);
268 
269  return css_unbias_refcnt(v);
270 }
271 
272 /* convenient tests for these bits */
273 inline int cgroup_is_removed(const struct cgroup *cgrp)
274 {
275  return test_bit(CGRP_REMOVED, &cgrp->flags);
276 }
277 
278 /* bits in struct cgroupfs_root flags field */
279 enum {
280  ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
281  ROOT_XATTR, /* supports extended attributes */
282 };
283 
284 static int cgroup_is_releasable(const struct cgroup *cgrp)
285 {
286  const int bits =
287  (1 << CGRP_RELEASABLE) |
288  (1 << CGRP_NOTIFY_ON_RELEASE);
289  return (cgrp->flags & bits) == bits;
290 }
291 
292 static int notify_on_release(const struct cgroup *cgrp)
293 {
294  return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295 }
296 
297 static int clone_children(const struct cgroup *cgrp)
298 {
299  return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300 }
301 
302 /*
303  * for_each_subsys() allows you to iterate on each subsystem attached to
304  * an active hierarchy
305  */
306 #define for_each_subsys(_root, _ss) \
307 list_for_each_entry(_ss, &_root->subsys_list, sibling)
308 
309 /* for_each_active_root() allows you to iterate across the active hierarchies */
310 #define for_each_active_root(_root) \
311 list_for_each_entry(_root, &roots, root_list)
312 
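/*
 * Illustrative sketch -- not part of cgroup.c.  Typical use of the two
 * iterators above: walk every active hierarchy, then every subsystem bound
 * to it, under cgroup_mutex (the pr_info() message is hypothetical):
 */
#if 0
static void dump_active_hierarchies(void)
{
	struct cgroupfs_root *root;
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_mutex);
	for_each_active_root(root)
		for_each_subsys(root, ss)
			pr_info("hierarchy %d: %s\n",
				root->hierarchy_id, ss->name);
	mutex_unlock(&cgroup_mutex);
}
#endif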
313 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
314 {
315  return dentry->d_fsdata;
316 }
317 
318 static inline struct cfent *__d_cfe(struct dentry *dentry)
319 {
320  return dentry->d_fsdata;
321 }
322 
323 static inline struct cftype *__d_cft(struct dentry *dentry)
324 {
325  return __d_cfe(dentry)->type;
326 }
327 
328 /* the list of cgroups eligible for automatic release. Protected by
329  * release_list_lock */
330 static LIST_HEAD(release_list);
331 static DEFINE_RAW_SPINLOCK(release_list_lock);
332 static void cgroup_release_agent(struct work_struct *work);
333 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
334 static void check_for_release(struct cgroup *cgrp);
335 
336 /* Link structure for associating css_set objects with cgroups */
337 struct cg_cgroup_link {
338  /*
339  * List running through cg_cgroup_links associated with a
340  * cgroup, anchored on cgroup->css_sets
341  */
342  struct list_head cgrp_link_list;
343  struct cgroup *cgrp;
344  /*
345  * List running through cg_cgroup_links pointing at a
346  * single css_set object, anchored on css_set->cg_links
347  */
348  struct list_head cg_link_list;
349  struct css_set *cg;
350 };
351 
352 /* The default css_set - used by init and its children prior to any
353  * hierarchies being mounted. It contains a pointer to the root state
354  * for each subsystem. Also used to anchor the list of css_sets. Not
355  * reference-counted, to improve performance when child cgroups
356  * haven't been created.
357  */
358 
359 static struct css_set init_css_set;
360 static struct cg_cgroup_link init_css_set_link;
361 
362 static int cgroup_init_idr(struct cgroup_subsys *ss,
363  struct cgroup_subsys_state *css);
364 
365 /* css_set_lock protects the list of css_set objects, and the
366  * chain of tasks off each css_set. Nests outside task->alloc_lock
367  * due to cgroup_iter_start() */
368 static DEFINE_RWLOCK(css_set_lock);
369 static int css_set_count;
370 
371 /*
372  * hash table for cgroup groups. This improves the performance of finding
373  * an existing css_set. This hash doesn't (currently) take into
374  * account cgroups in empty hierarchies.
375  */
376 #define CSS_SET_HASH_BITS 7
377 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
378 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
379 
380 static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
381 {
382  int i;
383  int index;
384  unsigned long tmp = 0UL;
385 
386  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
387  tmp += (unsigned long)css[i];
388  tmp = (tmp >> 16) ^ tmp;
389 
390  index = hash_long(tmp, CSS_SET_HASH_BITS);
391 
392  return &css_set_table[index];
393 }
394 
395 /* We don't maintain the lists running through each css_set to its
396  * task until after the first call to cgroup_iter_start(). This
397  * reduces the fork()/exit() overhead for people who have cgroups
398  * compiled into their kernel but not actually in use */
399 static int use_task_css_set_links __read_mostly;
400 
401 static void __put_css_set(struct css_set *cg, int taskexit)
402 {
403  struct cg_cgroup_link *link;
404  struct cg_cgroup_link *saved_link;
405  /*
406  * Ensure that the refcount doesn't hit zero while any readers
407  * can see it. Similar to atomic_dec_and_lock(), but for an
408  * rwlock
409  */
410  if (atomic_add_unless(&cg->refcount, -1, 1))
411  return;
412  write_lock(&css_set_lock);
413  if (!atomic_dec_and_test(&cg->refcount)) {
414  write_unlock(&css_set_lock);
415  return;
416  }
417 
418  /* This css_set is dead. unlink it and release cgroup refcounts */
419  hlist_del(&cg->hlist);
420  css_set_count--;
421 
422  list_for_each_entry_safe(link, saved_link, &cg->cg_links,
423  cg_link_list) {
424  struct cgroup *cgrp = link->cgrp;
425  list_del(&link->cg_link_list);
426  list_del(&link->cgrp_link_list);
427  if (atomic_dec_and_test(&cgrp->count) &&
428  notify_on_release(cgrp)) {
429  if (taskexit)
430  set_bit(CGRP_RELEASABLE, &cgrp->flags);
431  check_for_release(cgrp);
432  }
433 
434  kfree(link);
435  }
436 
437  write_unlock(&css_set_lock);
438  kfree_rcu(cg, rcu_head);
439 }
440 
441 /*
442  * refcounted get/put for css_set objects
443  */
444 static inline void get_css_set(struct css_set *cg)
445 {
446  atomic_inc(&cg->refcount);
447 }
448 
449 static inline void put_css_set(struct css_set *cg)
450 {
451  __put_css_set(cg, 0);
452 }
453 
454 static inline void put_css_set_taskexit(struct css_set *cg)
455 {
456  __put_css_set(cg, 1);
457 }
458 
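/*
 * Illustrative sketch -- not part of cgroup.c.  The helpers above pair up:
 * every reference taken with get_css_set() (or returned already counted,
 * e.g. by find_css_set() further below) must eventually be dropped with
 * put_css_set(); exit paths use put_css_set_taskexit() so the cgroup can be
 * marked releasable:
 */
#if 0
	struct css_set *cg = find_css_set(oldcg, cgrp);	/* returns with a ref */

	if (cg) {
		/* ... use cg ... */
		put_css_set(cg);			/* balance the ref */
	}
#endif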
459 /*
460  * compare_css_sets - helper function for find_existing_css_set().
461  * @cg: candidate css_set being tested
462  * @old_cg: existing css_set for a task
463  * @new_cgrp: cgroup that's being entered by the task
464  * @template: desired set of css pointers in css_set (pre-calculated)
465  *
466  * Returns true if "cg" matches "old_cg" except for the hierarchy
467  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
468  */
469 static bool compare_css_sets(struct css_set *cg,
470  struct css_set *old_cg,
471  struct cgroup *new_cgrp,
472  struct cgroup_subsys_state *template[])
473 {
474  struct list_head *l1, *l2;
475 
476  if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
477  /* Not all subsystems matched */
478  return false;
479  }
480 
481  /*
482  * Compare cgroup pointers in order to distinguish between
483  * different cgroups in hierarchies with no subsystems. We
484  * could get by with just this check alone (and skip the
485  * memcmp above) but on most setups the memcmp check will
486  * avoid the need for this more expensive check on almost all
487  * candidates.
488  */
489 
490  l1 = &cg->cg_links;
491  l2 = &old_cg->cg_links;
492  while (1) {
493  struct cg_cgroup_link *cgl1, *cgl2;
494  struct cgroup *cg1, *cg2;
495 
496  l1 = l1->next;
497  l2 = l2->next;
498  /* See if we reached the end - both lists are equal length. */
499  if (l1 == &cg->cg_links) {
500  BUG_ON(l2 != &old_cg->cg_links);
501  break;
502  } else {
503  BUG_ON(l2 == &old_cg->cg_links);
504  }
505  /* Locate the cgroups associated with these links. */
506  cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
507  cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
508  cg1 = cgl1->cgrp;
509  cg2 = cgl2->cgrp;
510  /* Hierarchies should be linked in the same order. */
511  BUG_ON(cg1->root != cg2->root);
512 
513  /*
514  * If this hierarchy is the hierarchy of the cgroup
515  * that's changing, then we need to check that this
516  * css_set points to the new cgroup; if it's any other
517  * hierarchy, then this css_set should point to the
518  * same cgroup as the old css_set.
519  */
520  if (cg1->root == new_cgrp->root) {
521  if (cg1 != new_cgrp)
522  return false;
523  } else {
524  if (cg1 != cg2)
525  return false;
526  }
527  }
528  return true;
529 }
530 
531 /*
532  * find_existing_css_set() is a helper for
533  * find_css_set(), and checks to see whether an existing
534  * css_set is suitable.
535  *
536  * oldcg: the cgroup group that we're using before the cgroup
537  * transition
538  *
539  * cgrp: the cgroup that we're moving into
540  *
541  * template: location in which to build the desired set of subsystem
542  * state objects for the new cgroup group
543  */
544 static struct css_set *find_existing_css_set(
545  struct css_set *oldcg,
546  struct cgroup *cgrp,
547  struct cgroup_subsys_state *template[])
548 {
549  int i;
550  struct cgroupfs_root *root = cgrp->root;
551  struct hlist_head *hhead;
552  struct hlist_node *node;
553  struct css_set *cg;
554 
555  /*
556  * Build the set of subsystem state objects that we want to see in the
557  * new css_set. while subsystems can change globally, the entries here
558  * won't change, so no need for locking.
559  */
560  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
561  if (root->subsys_mask & (1UL << i)) {
562  /* Subsystem is in this hierarchy. So we want
563  * the subsystem state from the new
564  * cgroup */
565  template[i] = cgrp->subsys[i];
566  } else {
567  /* Subsystem is not in this hierarchy, so we
568  * don't want to change the subsystem state */
569  template[i] = oldcg->subsys[i];
570  }
571  }
572 
573  hhead = css_set_hash(template);
574  hlist_for_each_entry(cg, node, hhead, hlist) {
575  if (!compare_css_sets(cg, oldcg, cgrp, template))
576  continue;
577 
578  /* This css_set matches what we need */
579  return cg;
580  }
581 
582  /* No existing cgroup group matched */
583  return NULL;
584 }
585 
586 static void free_cg_links(struct list_head *tmp)
587 {
588  struct cg_cgroup_link *link;
589  struct cg_cgroup_link *saved_link;
590 
591  list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
592  list_del(&link->cgrp_link_list);
593  kfree(link);
594  }
595 }
596 
597 /*
598  * allocate_cg_links() allocates "count" cg_cgroup_link structures
599  * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
600  * success or a negative error
601  */
602 static int allocate_cg_links(int count, struct list_head *tmp)
603 {
604  struct cg_cgroup_link *link;
605  int i;
606  INIT_LIST_HEAD(tmp);
607  for (i = 0; i < count; i++) {
608  link = kmalloc(sizeof(*link), GFP_KERNEL);
609  if (!link) {
610  free_cg_links(tmp);
611  return -ENOMEM;
612  }
613  list_add(&link->cgrp_link_list, tmp);
614  }
615  return 0;
616 }
617 
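/*
 * Illustrative sketch -- not part of cgroup.c.  Callers (find_css_set() and
 * cgroup_mount() below) pre-allocate one link per hierarchy, hand them out
 * via link_css_set(), and free whatever is left over:
 */
#if 0
	struct list_head tmp_cg_links;

	if (allocate_cg_links(root_count, &tmp_cg_links))
		return -ENOMEM;
	/* ... consume entries with link_css_set(&tmp_cg_links, cg, cgrp) ... */
	free_cg_links(&tmp_cg_links);	/* frees any unconsumed links */
#endif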
618 /**
619  * link_css_set - a helper function to link a css_set to a cgroup
620  * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
621  * @cg: the css_set to be linked
622  * @cgrp: the destination cgroup
623  */
624 static void link_css_set(struct list_head *tmp_cg_links,
625  struct css_set *cg, struct cgroup *cgrp)
626 {
627  struct cg_cgroup_link *link;
628 
629  BUG_ON(list_empty(tmp_cg_links));
630  link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
631  cgrp_link_list);
632  link->cg = cg;
633  link->cgrp = cgrp;
634  atomic_inc(&cgrp->count);
635  list_move(&link->cgrp_link_list, &cgrp->css_sets);
636  /*
637  * Always add links to the tail of the list so that the list
638  * is sorted by order of hierarchy creation
639  */
640  list_add_tail(&link->cg_link_list, &cg->cg_links);
641 }
642 
643 /*
644  * find_css_set() takes an existing cgroup group and a
645  * cgroup object, and returns a css_set object that's
646  * equivalent to the old group, but with the given cgroup
647  * substituted into the appropriate hierarchy. Must be called with
648  * cgroup_mutex held
649  */
650 static struct css_set *find_css_set(
651  struct css_set *oldcg, struct cgroup *cgrp)
652 {
653  struct css_set *res;
654  struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
655 
656  struct list_head tmp_cg_links;
657 
658  struct hlist_head *hhead;
659  struct cg_cgroup_link *link;
660 
661  /* First see if we already have a cgroup group that matches
662  * the desired set */
663  read_lock(&css_set_lock);
664  res = find_existing_css_set(oldcg, cgrp, template);
665  if (res)
666  get_css_set(res);
667  read_unlock(&css_set_lock);
668 
669  if (res)
670  return res;
671 
672  res = kmalloc(sizeof(*res), GFP_KERNEL);
673  if (!res)
674  return NULL;
675 
676  /* Allocate all the cg_cgroup_link objects that we'll need */
677  if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
678  kfree(res);
679  return NULL;
680  }
681 
682  atomic_set(&res->refcount, 1);
683  INIT_LIST_HEAD(&res->cg_links);
684  INIT_LIST_HEAD(&res->tasks);
685  INIT_HLIST_NODE(&res->hlist);
686 
687  /* Copy the set of subsystem state objects generated in
688  * find_existing_css_set() */
689  memcpy(res->subsys, template, sizeof(res->subsys));
690 
691  write_lock(&css_set_lock);
692  /* Add reference counts and links from the new css_set. */
693  list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
694  struct cgroup *c = link->cgrp;
695  if (c->root == cgrp->root)
696  c = cgrp;
697  link_css_set(&tmp_cg_links, res, c);
698  }
699 
700  BUG_ON(!list_empty(&tmp_cg_links));
701 
702  css_set_count++;
703 
704  /* Add this cgroup group to the hash table */
705  hhead = css_set_hash(res->subsys);
706  hlist_add_head(&res->hlist, hhead);
707 
708  write_unlock(&css_set_lock);
709 
710  return res;
711 }
712 
713 /*
714  * Return the cgroup for "task" from the given hierarchy. Must be
715  * called with cgroup_mutex held.
716  */
717 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
718  struct cgroupfs_root *root)
719 {
720  struct css_set *css;
721  struct cgroup *res = NULL;
722 
723  BUG_ON(!mutex_is_locked(&cgroup_mutex));
724  read_lock(&css_set_lock);
725  /*
726  * No need to lock the task - since we hold cgroup_mutex the
727  * task can't change groups, so the only thing that can happen
728  * is that it exits and its css is set back to init_css_set.
729  */
730  css = task->cgroups;
731  if (css == &init_css_set) {
732  res = &root->top_cgroup;
733  } else {
734  struct cg_cgroup_link *link;
735  list_for_each_entry(link, &css->cg_links, cg_link_list) {
736  struct cgroup *c = link->cgrp;
737  if (c->root == root) {
738  res = c;
739  break;
740  }
741  }
742  }
743  read_unlock(&css_set_lock);
744  BUG_ON(!res);
745  return res;
746 }
747 
748 /*
749  * There is one global cgroup mutex. We also require taking
750  * task_lock() when dereferencing a task's cgroup subsys pointers.
751  * See "The task_lock() exception", at the end of this comment.
752  *
753  * A task must hold cgroup_mutex to modify cgroups.
754  *
755  * Any task can increment and decrement the count field without lock.
756  * So in general, code holding cgroup_mutex can't rely on the count
757  * field not changing. However, if the count goes to zero, then only
758  * cgroup_attach_task() can increment it again. Because a count of zero
759  * means that no tasks are currently attached, therefore there is no
760  * way a task attached to that cgroup can fork (the other way to
761  * increment the count). So code holding cgroup_mutex can safely
762  * assume that if the count is zero, it will stay zero. Similarly, if
763  * a task holds cgroup_mutex on a cgroup with zero count, it
764  * knows that the cgroup won't be removed, as cgroup_rmdir()
765  * needs that mutex.
766  *
767  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
768  * (usually) take cgroup_mutex. These are the two most performance
769  * critical pieces of code here. The exception occurs on cgroup_exit(),
770  * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
771  * is taken, and if the cgroup count is zero, a usermode call made
772  * to the release agent with the name of the cgroup (path relative to
773  * the root of cgroup file system) as the argument.
774  *
775  * A cgroup can only be deleted if both its 'count' of using tasks
776  * is zero, and its list of 'children' cgroups is empty. Since all
777  * tasks in the system use _some_ cgroup, and since there is always at
778  * least one task in the system (init, pid == 1), therefore, top_cgroup
779  * always has either children cgroups and/or using tasks. So we don't
780  * need a special hack to ensure that top_cgroup cannot be deleted.
781  *
782  * The task_lock() exception
783  *
784  * The need for this exception arises from the action of
785  * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786  * another. It does so using cgroup_mutex, however there are
787  * several performance critical places that need to reference
788  * task->cgroup without the expense of grabbing a system global
789  * mutex. Therefore except as noted below, when dereferencing or, as
790  * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792  * the task_struct routinely used for such matters.
793  *
794  * P.S. One more locking exception. RCU is used to guard the
795  * update of a tasks cgroup pointer by cgroup_attach_task()
796  */
797 
798 /**
799  * cgroup_lock - lock out any changes to cgroup structures
800  *
801  */
802 void cgroup_lock(void)
803 {
804  mutex_lock(&cgroup_mutex);
805 }
806 EXPORT_SYMBOL_GPL(cgroup_lock);
807 
808 /**
809  * cgroup_unlock - release lock on cgroup changes
810  *
811  * Undo the lock taken in a previous cgroup_lock() call.
812  */
813 void cgroup_unlock(void)
814 {
815  mutex_unlock(&cgroup_mutex);
816 }
817 EXPORT_SYMBOL_GPL(cgroup_unlock);
818 
819 /*
820  * A couple of forward declarations required, due to cyclic reference loop:
821  * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
822  * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
823  * -> cgroup_mkdir.
824  */
825 
826 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
827 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
828 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
829 static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
830  unsigned long subsys_mask);
831 static const struct inode_operations cgroup_dir_inode_operations;
832 static const struct file_operations proc_cgroupstats_operations;
833 
834 static struct backing_dev_info cgroup_backing_dev_info = {
835  .name = "cgroup",
836  .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
837 };
838 
839 static int alloc_css_id(struct cgroup_subsys *ss,
840  struct cgroup *parent, struct cgroup *child);
841 
842 static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
843 {
844  struct inode *inode = new_inode(sb);
845 
846  if (inode) {
847  inode->i_ino = get_next_ino();
848  inode->i_mode = mode;
849  inode->i_uid = current_fsuid();
850  inode->i_gid = current_fsgid();
851  inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
852  inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
853  }
854  return inode;
855 }
856 
857 /*
858  * Call subsys's pre_destroy handler.
859  * This is called before css refcnt check.
860  */
861 static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862 {
863  struct cgroup_subsys *ss;
864  int ret = 0;
865 
866  for_each_subsys(cgrp->root, ss) {
867  if (!ss->pre_destroy)
868  continue;
869 
870  ret = ss->pre_destroy(cgrp);
871  if (ret) {
872  /* ->pre_destroy() failure is being deprecated */
873  WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874  break;
875  }
876  }
877 
878  return ret;
879 }
880 
881 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882 {
883  /* is dentry a directory ? if so, kfree() associated cgroup */
884  if (S_ISDIR(inode->i_mode)) {
885  struct cgroup *cgrp = dentry->d_fsdata;
886  struct cgroup_subsys *ss;
887  BUG_ON(!(cgroup_is_removed(cgrp)));
888  /* It's possible for external users to be holding css
889  * reference counts on a cgroup; css_put() needs to
890  * be able to access the cgroup after decrementing
891  * the reference count in order to know if it needs to
892  * queue the cgroup to be handled by the release
893  * agent */
894  synchronize_rcu();
895 
896  mutex_lock(&cgroup_mutex);
897  /*
898  * Release the subsystem state objects.
899  */
900  for_each_subsys(cgrp->root, ss)
901  ss->destroy(cgrp);
902 
903  cgrp->root->number_of_cgroups--;
904  mutex_unlock(&cgroup_mutex);
905 
906  /*
907  * Drop the active superblock reference that we took when we
908  * created the cgroup
909  */
910  deactivate_super(cgrp->root->sb);
911 
912  /*
913  * if we're getting rid of the cgroup, refcount should ensure
914  * that there are no pidlists left.
915  */
916  BUG_ON(!list_empty(&cgrp->pidlists));
917 
918  simple_xattrs_free(&cgrp->xattrs);
919 
920  kfree_rcu(cgrp, rcu_head);
921  } else {
922  struct cfent *cfe = __d_cfe(dentry);
923  struct cgroup *cgrp = dentry->d_parent->d_fsdata;
924  struct cftype *cft = cfe->type;
925 
926  WARN_ONCE(!list_empty(&cfe->node) &&
927  cgrp != &cgrp->root->top_cgroup,
928  "cfe still linked for %s\n", cfe->type->name);
929  kfree(cfe);
930  simple_xattrs_free(&cft->xattrs);
931  }
932  iput(inode);
933 }
934 
935 static int cgroup_delete(const struct dentry *d)
936 {
937  return 1;
938 }
939 
940 static void remove_dir(struct dentry *d)
941 {
942  struct dentry *parent = dget(d->d_parent);
943 
944  d_delete(d);
945  simple_rmdir(parent->d_inode, d);
946  dput(parent);
947 }
948 
949 static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
950 {
951  struct cfent *cfe;
952 
953  lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
954  lockdep_assert_held(&cgroup_mutex);
955 
956  list_for_each_entry(cfe, &cgrp->files, node) {
957  struct dentry *d = cfe->dentry;
958 
959  if (cft && cfe->type != cft)
960  continue;
961 
962  dget(d);
963  d_delete(d);
964  simple_unlink(cgrp->dentry->d_inode, d);
965  list_del_init(&cfe->node);
966  dput(d);
967 
968  return 0;
969  }
970  return -ENOENT;
971 }
972 
979 static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980  unsigned long subsys_mask)
981 {
982  struct cgroup *cgrp = __d_cgrp(dir);
983  struct cgroup_subsys *ss;
984 
985  for_each_subsys(cgrp->root, ss) {
986  struct cftype_set *set;
987  if (!test_bit(ss->subsys_id, &subsys_mask))
988  continue;
989  list_for_each_entry(set, &ss->cftsets, node)
990  cgroup_rm_file(cgrp, set->cfts);
991  }
992  if (base_files) {
993  while (!list_empty(&cgrp->files))
994  cgroup_rm_file(cgrp, NULL);
995  }
996 }
997 
998 /*
999  * NOTE : the dentry must have been dget()'ed
1000  */
1001 static void cgroup_d_remove_dir(struct dentry *dentry)
1002 {
1003  struct dentry *parent;
1004  struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1005 
1006  cgroup_clear_directory(dentry, true, root->subsys_mask);
1007 
1008  parent = dentry->d_parent;
1009  spin_lock(&parent->d_lock);
1010  spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1011  list_del_init(&dentry->d_u.d_child);
1012  spin_unlock(&dentry->d_lock);
1013  spin_unlock(&parent->d_lock);
1014  remove_dir(dentry);
1015 }
1016 
1017 /*
1018  * A queue for waiters to do rmdir() on a cgroup. A task will sleep when
1019  * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020  * reference to css->refcnt. In general, this refcnt is expected to go down
1021  * to zero soon.
1022  *
1023  * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024  */
1025 static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026 
1027 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028 {
1029  if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030  wake_up_all(&cgroup_rmdir_waitq);
1031 }
1032 
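/*
 * Illustrative sketch -- not part of cgroup.c.  The waiter side that pairs
 * with the wakeup above lives in the rmdir path and looks roughly like
 * this: flag the cgroup, sleep on cgroup_rmdir_waitq, then retry the
 * removal once woken.
 */
#if 0
	DEFINE_WAIT(wait);

	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	/* ... drop locks ... */
	schedule();
	finish_wait(&cgroup_rmdir_waitq, &wait);
	/* ... retake locks and retry the rmdir ... */
#endif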
1033 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034 {
1035  css_get(css);
1036 }
1037 
1038 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039 {
1040  cgroup_wakeup_rmdir_waiter(css->cgroup);
1041  css_put(css);
1042 }
1043 
1044 /*
1045  * Call with cgroup_mutex held. Drops reference counts on modules, including
1046  * any duplicate ones that parse_cgroupfs_options took. If this function
1047  * returns an error, no reference counts are touched.
1048  */
1049 static int rebind_subsystems(struct cgroupfs_root *root,
1050  unsigned long final_subsys_mask)
1051 {
1052  unsigned long added_mask, removed_mask;
1053  struct cgroup *cgrp = &root->top_cgroup;
1054  int i;
1055 
1056  BUG_ON(!mutex_is_locked(&cgroup_mutex));
1057  BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1058 
1059  removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1060  added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1061  /* Check that any added subsystems are currently free */
1062  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1063  unsigned long bit = 1UL << i;
1064  struct cgroup_subsys *ss = subsys[i];
1065  if (!(bit & added_mask))
1066  continue;
1067  /*
1068  * Nobody should tell us to do a subsys that doesn't exist:
1069  * parse_cgroupfs_options should catch that case and refcounts
1070  * ensure that subsystems won't disappear once selected.
1071  */
1072  BUG_ON(ss == NULL);
1073  if (ss->root != &rootnode) {
1074  /* Subsystem isn't free */
1075  return -EBUSY;
1076  }
1077  }
1078 
1079  /* Currently we don't handle adding/removing subsystems when
1080  * any child cgroups exist. This is theoretically supportable
1081  * but involves complex error handling, so it's being left until
1082  * later */
1083  if (root->number_of_cgroups > 1)
1084  return -EBUSY;
1085 
1086  /* Process each subsystem */
1087  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1088  struct cgroup_subsys *ss = subsys[i];
1089  unsigned long bit = 1UL << i;
1090  if (bit & added_mask) {
1091  /* We're binding this subsystem to this hierarchy */
1092  BUG_ON(ss == NULL);
1093  BUG_ON(cgrp->subsys[i]);
1094  BUG_ON(!dummytop->subsys[i]);
1095  BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
1096  cgrp->subsys[i] = dummytop->subsys[i];
1097  cgrp->subsys[i]->cgroup = cgrp;
1098  list_move(&ss->sibling, &root->subsys_list);
1099  ss->root = root;
1100  if (ss->bind)
1101  ss->bind(cgrp);
1102  /* refcount was already taken, and we're keeping it */
1103  } else if (bit & removed_mask) {
1104  /* We're removing this subsystem */
1105  BUG_ON(ss == NULL);
1106  BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1107  BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1108  if (ss->bind)
1109  ss->bind(dummytop);
1110  dummytop->subsys[i]->cgroup = dummytop;
1111  cgrp->subsys[i] = NULL;
1112  subsys[i]->root = &rootnode;
1113  list_move(&ss->sibling, &rootnode.subsys_list);
1114  /* subsystem is now free - drop reference on module */
1115  module_put(ss->module);
1116  } else if (bit & final_subsys_mask) {
1117  /* Subsystem state should already exist */
1118  BUG_ON(ss == NULL);
1119  BUG_ON(!cgrp->subsys[i]);
1120  /*
1121  * a refcount was taken, but we already had one, so
1122  * drop the extra reference.
1123  */
1124  module_put(ss->module);
1125 #ifdef CONFIG_MODULE_UNLOAD
1126  BUG_ON(ss->module && !module_refcount(ss->module));
1127 #endif
1128  } else {
1129  /* Subsystem state shouldn't exist */
1130  BUG_ON(cgrp->subsys[i]);
1131  }
1132  }
1133  root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1134  synchronize_rcu();
1135 
1136  return 0;
1137 }
1138 
1139 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1140 {
1141  struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1142  struct cgroup_subsys *ss;
1143 
1144  mutex_lock(&cgroup_root_mutex);
1145  for_each_subsys(root, ss)
1146  seq_printf(seq, ",%s", ss->name);
1147  if (test_bit(ROOT_NOPREFIX, &root->flags))
1148  seq_puts(seq, ",noprefix");
1149  if (test_bit(ROOT_XATTR, &root->flags))
1150  seq_puts(seq, ",xattr");
1151  if (strlen(root->release_agent_path))
1152  seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153  if (clone_children(&root->top_cgroup))
1154  seq_puts(seq, ",clone_children");
1155  if (strlen(root->name))
1156  seq_printf(seq, ",name=%s", root->name);
1157  mutex_unlock(&cgroup_root_mutex);
1158  return 0;
1159 }
1160 
1161 struct cgroup_sb_opts {
1162  unsigned long subsys_mask;
1163  unsigned long flags;
1164  char *release_agent;
1165  bool clone_children;
1166  char *name;
1167  /* User explicitly requested empty subsystem */
1168  bool none;
1169 
1170  struct cgroupfs_root *new_root;
1171 
1172 };
1173 
1174 /*
1175  * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
1176  * with cgroup_mutex held to protect the subsys[] array. This function takes
1177  * refcounts on subsystems to be used, unless it returns error, in which case
1178  * no refcounts are taken.
1179  */
1180 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1181 {
1182  char *token, *o = data;
1183  bool all_ss = false, one_ss = false;
1184  unsigned long mask = (unsigned long)-1;
1185  int i;
1186  bool module_pin_failed = false;
1187 
1188  BUG_ON(!mutex_is_locked(&cgroup_mutex));
1189 
1190 #ifdef CONFIG_CPUSETS
1191  mask = ~(1UL << cpuset_subsys_id);
1192 #endif
1193 
1194  memset(opts, 0, sizeof(*opts));
1195 
1196  while ((token = strsep(&o, ",")) != NULL) {
1197  if (!*token)
1198  return -EINVAL;
1199  if (!strcmp(token, "none")) {
1200  /* Explicitly have no subsystems */
1201  opts->none = true;
1202  continue;
1203  }
1204  if (!strcmp(token, "all")) {
1205  /* Mutually exclusive option 'all' + subsystem name */
1206  if (one_ss)
1207  return -EINVAL;
1208  all_ss = true;
1209  continue;
1210  }
1211  if (!strcmp(token, "noprefix")) {
1212  set_bit(ROOT_NOPREFIX, &opts->flags);
1213  continue;
1214  }
1215  if (!strcmp(token, "clone_children")) {
1216  opts->clone_children = true;
1217  continue;
1218  }
1219  if (!strcmp(token, "xattr")) {
1220  set_bit(ROOT_XATTR, &opts->flags);
1221  continue;
1222  }
1223  if (!strncmp(token, "release_agent=", 14)) {
1224  /* Specifying two release agents is forbidden */
1225  if (opts->release_agent)
1226  return -EINVAL;
1227  opts->release_agent =
1228  kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1229  if (!opts->release_agent)
1230  return -ENOMEM;
1231  continue;
1232  }
1233  if (!strncmp(token, "name=", 5)) {
1234  const char *name = token + 5;
1235  /* Can't specify an empty name */
1236  if (!strlen(name))
1237  return -EINVAL;
1238  /* Must match [\w.-]+ */
1239  for (i = 0; i < strlen(name); i++) {
1240  char c = name[i];
1241  if (isalnum(c))
1242  continue;
1243  if ((c == '.') || (c == '-') || (c == '_'))
1244  continue;
1245  return -EINVAL;
1246  }
1247  /* Specifying two names is forbidden */
1248  if (opts->name)
1249  return -EINVAL;
1250  opts->name = kstrndup(name,
1251  MAX_CGROUP_ROOT_NAMELEN - 1,
1252  GFP_KERNEL);
1253  if (!opts->name)
1254  return -ENOMEM;
1255 
1256  continue;
1257  }
1258 
1259  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1260  struct cgroup_subsys *ss = subsys[i];
1261  if (ss == NULL)
1262  continue;
1263  if (strcmp(token, ss->name))
1264  continue;
1265  if (ss->disabled)
1266  continue;
1267 
1268  /* Mutually exclusive option 'all' + subsystem name */
1269  if (all_ss)
1270  return -EINVAL;
1271  set_bit(i, &opts->subsys_mask);
1272  one_ss = true;
1273 
1274  break;
1275  }
1276  if (i == CGROUP_SUBSYS_COUNT)
1277  return -ENOENT;
1278  }
1279 
1280  /*
1281  * If the 'all' option was specified select all the subsystems,
1282  * otherwise if 'none', 'name=' and a subsystem name options
1283  * were not specified, let's default to 'all'
1284  */
1285  if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1286  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1287  struct cgroup_subsys *ss = subsys[i];
1288  if (ss == NULL)
1289  continue;
1290  if (ss->disabled)
1291  continue;
1292  set_bit(i, &opts->subsys_mask);
1293  }
1294  }
1295 
1296  /* Consistency checks */
1297 
1298  /*
1299  * Option noprefix was introduced just for backward compatibility
1300  * with the old cpuset, so we allow noprefix only if mounting just
1301  * the cpuset subsystem.
1302  */
1303  if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1304  (opts->subsys_mask & mask))
1305  return -EINVAL;
1306 
1307 
1308  /* Can't specify "none" and some subsystems */
1309  if (opts->subsys_mask && opts->none)
1310  return -EINVAL;
1311 
1312  /*
1313  * We either have to specify by name or by subsystems. (So all
1314  * empty hierarchies must have a name).
1315  */
1316  if (!opts->subsys_mask && !opts->name)
1317  return -EINVAL;
1318 
1319  /*
1320  * Grab references on all the modules we'll need, so the subsystems
1321  * don't dance around before rebind_subsystems attaches them. This may
1322  * take duplicate reference counts on a subsystem that's already used,
1323  * but rebind_subsystems handles this case.
1324  */
1325  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1326  unsigned long bit = 1UL << i;
1327 
1328  if (!(bit & opts->subsys_mask))
1329  continue;
1330  if (!try_module_get(subsys[i]->module)) {
1331  module_pin_failed = true;
1332  break;
1333  }
1334  }
1335  if (module_pin_failed) {
1336  /*
1337  * oops, one of the modules was going away. this means that we
1338  * raced with a module_delete call, and to the user this is
1339  * essentially a "subsystem doesn't exist" case.
1340  */
1341  for (i--; i >= 0; i--) {
1342  /* drop refcounts only on the ones we took */
1343  unsigned long bit = 1UL << i;
1344 
1345  if (!(bit & opts->subsys_mask))
1346  continue;
1347  module_put(subsys[i]->module);
1348  }
1349  return -ENOENT;
1350  }
1351 
1352  return 0;
1353 }
1354 
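/*
 * Illustrative sketch -- not part of cgroup.c.  What a caller sees for a
 * typical mount data string (the string itself is a hypothetical example):
 */
#if 0
	struct cgroup_sb_opts opts;
	char data[] = "cpuset,noprefix,name=demo";
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);
	/*
	 * On success: the cpuset bit is set in opts.subsys_mask,
	 * ROOT_NOPREFIX is set in opts.flags, opts.name is "demo", and a
	 * module reference is held on each selected subsystem (drop with
	 * drop_parsed_module_refcounts() if the mount is abandoned).
	 */
	kfree(opts.release_agent);
	kfree(opts.name);
#endif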
1355 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1356 {
1357  int i;
1358  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1359  unsigned long bit = 1UL << i;
1360 
1361  if (!(bit & subsys_mask))
1362  continue;
1363  module_put(subsys[i]->module);
1364  }
1365 }
1366 
1367 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1368 {
1369  int ret = 0;
1370  struct cgroupfs_root *root = sb->s_fs_info;
1371  struct cgroup *cgrp = &root->top_cgroup;
1372  struct cgroup_sb_opts opts;
1373  unsigned long added_mask, removed_mask;
1374 
1375  mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1376  mutex_lock(&cgroup_mutex);
1377  mutex_lock(&cgroup_root_mutex);
1378 
1379  /* See what subsystems are wanted */
1380  ret = parse_cgroupfs_options(data, &opts);
1381  if (ret)
1382  goto out_unlock;
1383 
1384  /* See feature-removal-schedule.txt */
1385  if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1386  pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1387  task_tgid_nr(current), current->comm);
1388 
1389  added_mask = opts.subsys_mask & ~root->subsys_mask;
1390  removed_mask = root->subsys_mask & ~opts.subsys_mask;
1391 
1392  /* Don't allow flags or name to change at remount */
1393  if (opts.flags != root->flags ||
1394  (opts.name && strcmp(opts.name, root->name))) {
1395  ret = -EINVAL;
1396  drop_parsed_module_refcounts(opts.subsys_mask);
1397  goto out_unlock;
1398  }
1399 
1400  ret = rebind_subsystems(root, opts.subsys_mask);
1401  if (ret) {
1402  drop_parsed_module_refcounts(opts.subsys_mask);
1403  goto out_unlock;
1404  }
1405 
1406  /* clear out any existing files and repopulate subsystem files */
1407  cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408  /* re-populate subsystem files */
1409  cgroup_populate_dir(cgrp, false, added_mask);
1410 
1411  if (opts.release_agent)
1412  strcpy(root->release_agent_path, opts.release_agent);
1413  out_unlock:
1414  kfree(opts.release_agent);
1415  kfree(opts.name);
1416  mutex_unlock(&cgroup_root_mutex);
1417  mutex_unlock(&cgroup_mutex);
1418  mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1419  return ret;
1420 }
1421 
1422 static const struct super_operations cgroup_ops = {
1423  .statfs = simple_statfs,
1424  .drop_inode = generic_delete_inode,
1425  .show_options = cgroup_show_options,
1426  .remount_fs = cgroup_remount,
1427 };
1428 
1429 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1430 {
1431  INIT_LIST_HEAD(&cgrp->sibling);
1432  INIT_LIST_HEAD(&cgrp->children);
1433  INIT_LIST_HEAD(&cgrp->files);
1434  INIT_LIST_HEAD(&cgrp->css_sets);
1435  INIT_LIST_HEAD(&cgrp->release_list);
1436  INIT_LIST_HEAD(&cgrp->pidlists);
1437  mutex_init(&cgrp->pidlist_mutex);
1438  INIT_LIST_HEAD(&cgrp->event_list);
1439  spin_lock_init(&cgrp->event_list_lock);
1440  simple_xattrs_init(&cgrp->xattrs);
1441 }
1442 
1443 static void init_cgroup_root(struct cgroupfs_root *root)
1444 {
1445  struct cgroup *cgrp = &root->top_cgroup;
1446 
1447  INIT_LIST_HEAD(&root->subsys_list);
1448  INIT_LIST_HEAD(&root->root_list);
1449  INIT_LIST_HEAD(&root->allcg_list);
1450  root->number_of_cgroups = 1;
1451  cgrp->root = root;
1452  cgrp->top_cgroup = cgrp;
1453  list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454  init_cgroup_housekeeping(cgrp);
1455 }
1456 
1457 static bool init_root_id(struct cgroupfs_root *root)
1458 {
1459  int ret = 0;
1460 
1461  do {
1462  if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1463  return false;
1464  spin_lock(&hierarchy_id_lock);
1465  /* Try to allocate the next unused ID */
1466  ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1467  &root->hierarchy_id);
1468  if (ret == -ENOSPC)
1469  /* Try again starting from 0 */
1470  ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1471  if (!ret) {
1472  next_hierarchy_id = root->hierarchy_id + 1;
1473  } else if (ret != -EAGAIN) {
1474  /* Can only get here if the 31-bit IDR is full ... */
1475  BUG_ON(ret);
1476  }
1477  spin_unlock(&hierarchy_id_lock);
1478  } while (ret);
1479  return true;
1480 }
1481 
1482 static int cgroup_test_super(struct super_block *sb, void *data)
1483 {
1484  struct cgroup_sb_opts *opts = data;
1485  struct cgroupfs_root *root = sb->s_fs_info;
1486 
1487  /* If we asked for a name then it must match */
1488  if (opts->name && strcmp(opts->name, root->name))
1489  return 0;
1490 
1491  /*
1492  * If we asked for subsystems (or explicitly for no
1493  * subsystems) then they must match
1494  */
1495  if ((opts->subsys_mask || opts->none)
1496  && (opts->subsys_mask != root->subsys_mask))
1497  return 0;
1498 
1499  return 1;
1500 }
1501 
1502 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 {
1504  struct cgroupfs_root *root;
1505 
1506  if (!opts->subsys_mask && !opts->none)
1507  return NULL;
1508 
1509  root = kzalloc(sizeof(*root), GFP_KERNEL);
1510  if (!root)
1511  return ERR_PTR(-ENOMEM);
1512 
1513  if (!init_root_id(root)) {
1514  kfree(root);
1515  return ERR_PTR(-ENOMEM);
1516  }
1517  init_cgroup_root(root);
1518 
1519  root->subsys_mask = opts->subsys_mask;
1520  root->flags = opts->flags;
1521  if (opts->release_agent)
1522  strcpy(root->release_agent_path, opts->release_agent);
1523  if (opts->name)
1524  strcpy(root->name, opts->name);
1525  if (opts->clone_children)
1526  set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1527  return root;
1528 }
1529 
1530 static void cgroup_drop_root(struct cgroupfs_root *root)
1531 {
1532  if (!root)
1533  return;
1534 
1535  BUG_ON(!root->hierarchy_id);
1536  spin_lock(&hierarchy_id_lock);
1537  ida_remove(&hierarchy_ida, root->hierarchy_id);
1538  spin_unlock(&hierarchy_id_lock);
1539  kfree(root);
1540 }
1541 
1542 static int cgroup_set_super(struct super_block *sb, void *data)
1543 {
1544  int ret;
1545  struct cgroup_sb_opts *opts = data;
1546 
1547  /* If we don't have a new root, we can't set up a new sb */
1548  if (!opts->new_root)
1549  return -EINVAL;
1550 
1551  BUG_ON(!opts->subsys_mask && !opts->none);
1552 
1553  ret = set_anon_super(sb, NULL);
1554  if (ret)
1555  return ret;
1556 
1557  sb->s_fs_info = opts->new_root;
1558  opts->new_root->sb = sb;
1559 
1560  sb->s_blocksize = PAGE_CACHE_SIZE;
1561  sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1562  sb->s_magic = CGROUP_SUPER_MAGIC;
1563  sb->s_op = &cgroup_ops;
1564 
1565  return 0;
1566 }
1567 
1568 static int cgroup_get_rootdir(struct super_block *sb)
1569 {
1570  static const struct dentry_operations cgroup_dops = {
1571  .d_iput = cgroup_diput,
1572  .d_delete = cgroup_delete,
1573  };
1574 
1575  struct inode *inode =
1576  cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1577 
1578  if (!inode)
1579  return -ENOMEM;
1580 
1581  inode->i_fop = &simple_dir_operations;
1582  inode->i_op = &cgroup_dir_inode_operations;
1583  /* directories start off with i_nlink == 2 (for "." entry) */
1584  inc_nlink(inode);
1585  sb->s_root = d_make_root(inode);
1586  if (!sb->s_root)
1587  return -ENOMEM;
1588  /* for everything else we want ->d_op set */
1589  sb->s_d_op = &cgroup_dops;
1590  return 0;
1591 }
1592 
1593 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1594  int flags, const char *unused_dev_name,
1595  void *data)
1596 {
1597  struct cgroup_sb_opts opts;
1598  struct cgroupfs_root *root;
1599  int ret = 0;
1600  struct super_block *sb;
1601  struct cgroupfs_root *new_root;
1602  struct inode *inode;
1603 
1604  /* First find the desired set of subsystems */
1605  mutex_lock(&cgroup_mutex);
1606  ret = parse_cgroupfs_options(data, &opts);
1607  mutex_unlock(&cgroup_mutex);
1608  if (ret)
1609  goto out_err;
1610 
1611  /*
1612  * Allocate a new cgroup root. We may not need it if we're
1613  * reusing an existing hierarchy.
1614  */
1615  new_root = cgroup_root_from_opts(&opts);
1616  if (IS_ERR(new_root)) {
1617  ret = PTR_ERR(new_root);
1618  goto drop_modules;
1619  }
1620  opts.new_root = new_root;
1621 
1622  /* Locate an existing or new sb for this hierarchy */
1623  sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1624  if (IS_ERR(sb)) {
1625  ret = PTR_ERR(sb);
1626  cgroup_drop_root(opts.new_root);
1627  goto drop_modules;
1628  }
1629 
1630  root = sb->s_fs_info;
1631  BUG_ON(!root);
1632  if (root == opts.new_root) {
1633  /* We used the new root structure, so this is a new hierarchy */
1634  struct list_head tmp_cg_links;
1635  struct cgroup *root_cgrp = &root->top_cgroup;
1636  struct cgroupfs_root *existing_root;
1637  const struct cred *cred;
1638  int i;
1639 
1640  BUG_ON(sb->s_root != NULL);
1641 
1642  ret = cgroup_get_rootdir(sb);
1643  if (ret)
1644  goto drop_new_super;
1645  inode = sb->s_root->d_inode;
1646 
1647  mutex_lock(&inode->i_mutex);
1648  mutex_lock(&cgroup_mutex);
1649  mutex_lock(&cgroup_root_mutex);
1650 
1651  /* Check for name clashes with existing mounts */
1652  ret = -EBUSY;
1653  if (strlen(root->name))
1654  for_each_active_root(existing_root)
1655  if (!strcmp(existing_root->name, root->name))
1656  goto unlock_drop;
1657 
1658  /*
1659  * We're accessing css_set_count without locking
1660  * css_set_lock here, but that's OK - it can only be
1661  * increased by someone holding cgroup_lock, and
1662  * that's us. The worst that can happen is that we
1663  * have some link structures left over
1664  */
1665  ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1666  if (ret)
1667  goto unlock_drop;
1668 
1669  ret = rebind_subsystems(root, root->subsys_mask);
1670  if (ret == -EBUSY) {
1671  free_cg_links(&tmp_cg_links);
1672  goto unlock_drop;
1673  }
1674  /*
1675  * There must be no failure case after here, since rebinding
1676  * takes care of subsystems' refcounts, which are explicitly
1677  * dropped in the failure exit path.
1678  */
1679 
1680  /* EBUSY should be the only error here */
1681  BUG_ON(ret);
1682 
1683  list_add(&root->root_list, &roots);
1684  root_count++;
1685 
1686  sb->s_root->d_fsdata = root_cgrp;
1687  root->top_cgroup.dentry = sb->s_root;
1688 
1689  /* Link the top cgroup in this hierarchy into all
1690  * the css_set objects */
1691  write_lock(&css_set_lock);
1692  for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1693  struct hlist_head *hhead = &css_set_table[i];
1694  struct hlist_node *node;
1695  struct css_set *cg;
1696 
1697  hlist_for_each_entry(cg, node, hhead, hlist)
1698  link_css_set(&tmp_cg_links, cg, root_cgrp);
1699  }
1700  write_unlock(&css_set_lock);
1701 
1702  free_cg_links(&tmp_cg_links);
1703 
1704  BUG_ON(!list_empty(&root_cgrp->sibling));
1705  BUG_ON(!list_empty(&root_cgrp->children));
1706  BUG_ON(root->number_of_cgroups != 1);
1707 
1708  cred = override_creds(&init_cred);
1709  cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1710  revert_creds(cred);
1711  mutex_unlock(&cgroup_root_mutex);
1712  mutex_unlock(&cgroup_mutex);
1713  mutex_unlock(&inode->i_mutex);
1714  } else {
1715  /*
1716  * We re-used an existing hierarchy - the new root (if
1717  * any) is not needed
1718  */
1719  cgroup_drop_root(opts.new_root);
1720  /* no subsys rebinding, so refcounts don't change */
1721  drop_parsed_module_refcounts(opts.subsys_mask);
1722  }
1723 
1724  kfree(opts.release_agent);
1725  kfree(opts.name);
1726  return dget(sb->s_root);
1727 
1728  unlock_drop:
1729  mutex_unlock(&cgroup_root_mutex);
1730  mutex_unlock(&cgroup_mutex);
1731  mutex_unlock(&inode->i_mutex);
1732  drop_new_super:
1733  deactivate_locked_super(sb);
1734  drop_modules:
1735  drop_parsed_module_refcounts(opts.subsys_mask);
1736  out_err:
1737  kfree(opts.release_agent);
1738  kfree(opts.name);
1739  return ERR_PTR(ret);
1740 }
1741 
1742 static void cgroup_kill_sb(struct super_block *sb) {
1743  struct cgroupfs_root *root = sb->s_fs_info;
1744  struct cgroup *cgrp = &root->top_cgroup;
1745  int ret;
1746  struct cg_cgroup_link *link;
1747  struct cg_cgroup_link *saved_link;
1748 
1749  BUG_ON(!root);
1750 
1751  BUG_ON(root->number_of_cgroups != 1);
1752  BUG_ON(!list_empty(&cgrp->children));
1753  BUG_ON(!list_empty(&cgrp->sibling));
1754 
1755  mutex_lock(&cgroup_mutex);
1756  mutex_lock(&cgroup_root_mutex);
1757 
1758  /* Rebind all subsystems back to the default hierarchy */
1759  ret = rebind_subsystems(root, 0);
1760  /* Shouldn't be able to fail ... */
1761  BUG_ON(ret);
1762 
1763  /*
1764  * Release all the links from css_sets to this hierarchy's
1765  * root cgroup
1766  */
1767  write_lock(&css_set_lock);
1768 
1769  list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1770  cgrp_link_list) {
1771  list_del(&link->cg_link_list);
1772  list_del(&link->cgrp_link_list);
1773  kfree(link);
1774  }
1775  write_unlock(&css_set_lock);
1776 
1777  if (!list_empty(&root->root_list)) {
1778  list_del(&root->root_list);
1779  root_count--;
1780  }
1781 
1782  mutex_unlock(&cgroup_root_mutex);
1783  mutex_unlock(&cgroup_mutex);
1784 
1785  simple_xattrs_free(&cgrp->xattrs);
1786 
1787  kill_litter_super(sb);
1788  cgroup_drop_root(root);
1789 }
1790 
1791 static struct file_system_type cgroup_fs_type = {
1792  .name = "cgroup",
1793  .mount = cgroup_mount,
1794  .kill_sb = cgroup_kill_sb,
1795 };
1796 
1797 static struct kobject *cgroup_kobj;
1798 
1799 /**
1800  * cgroup_path - generate the path of a cgroup
1801  * @cgrp: the cgroup in question
1802  * @buf: the buffer to write the path into
1803  * @buflen: the length of the buffer
1804  *
1805  * Called with cgroup_mutex held or else with an RCU-protected cgroup
1806  * reference. Writes path of cgroup into buf. Returns 0 on success,
1807  * -errno on error.
1808  */
1809 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810 {
1811  char *start;
1812  struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1813  cgroup_lock_is_held());
1814 
1815  if (!dentry || cgrp == dummytop) {
1816  /*
1817  * Inactive subsystems have no dentry for their root
1818  * cgroup
1819  */
1820  strcpy(buf, "/");
1821  return 0;
1822  }
1823 
1824  start = buf + buflen;
1825 
1826  *--start = '\0';
1827  for (;;) {
1828  int len = dentry->d_name.len;
1829 
1830  if ((start -= len) < buf)
1831  return -ENAMETOOLONG;
1832  memcpy(start, dentry->d_name.name, len);
1833  cgrp = cgrp->parent;
1834  if (!cgrp)
1835  break;
1836 
1837  dentry = rcu_dereference_check(cgrp->dentry,
1838  cgroup_lock_is_held());
1839  if (!cgrp->parent)
1840  continue;
1841  if (--start < buf)
1842  return -ENAMETOOLONG;
1843  *start = '/';
1844  }
1845  memmove(buf, start, buf + buflen - start);
1846  return 0;
1847 }
1848 EXPORT_SYMBOL_GPL(cgroup_path);
1849 
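/*
 * Illustrative sketch -- not part of cgroup.c.  Typical use of
 * cgroup_path() from a context that holds cgroup_mutex (the pr_info()
 * message is hypothetical):
 */
#if 0
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (buf) {
		cgroup_lock();
		if (!cgroup_path(cgrp, buf, PATH_MAX))
			pr_info("cgroup path: %s\n", buf);
		cgroup_unlock();
		kfree(buf);
	}
#endif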
1850 /*
1851  * Control Group taskset
1852  */
1853 struct task_and_cgroup {
1854  struct task_struct *task;
1855  struct cgroup *cgrp;
1856  struct css_set *cg;
1857 };
1858 
1859 struct cgroup_taskset {
1860  struct task_and_cgroup single;
1861  struct flex_array *tc_array;
1862  int tc_array_len;
1863  int idx;
1864  struct cgroup *cur_cgrp;
1865 };
1866 
1867 /**
1868  * cgroup_taskset_first - reset taskset and return the first task
1869  * @tset: taskset of interest
1870  *
1871  * @tset iteration is initialized and the first task is returned.
1872  */
1873 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1874 {
1875  if (tset->tc_array) {
1876  tset->idx = 0;
1877  return cgroup_taskset_next(tset);
1878  } else {
1879  tset->cur_cgrp = tset->single.cgrp;
1880  return tset->single.task;
1881  }
1882 }
1883 EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1884 
1885 /**
1886  * cgroup_taskset_next - iterate to the next task and return it
1887  * @tset: taskset of interest
1888  *
1889  * Return the next task in @tset. Iteration must have been initialized
1890  * with cgroup_taskset_first().
1891  */
1892 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1893 {
1894  struct task_and_cgroup *tc;
1895 
1896  if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1897  return NULL;
1898 
1899  tc = flex_array_get(tset->tc_array, tset->idx++);
1900  tset->cur_cgrp = tc->cgrp;
1901  return tc->task;
1902 }
1903 EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1904 
1905 /**
1906  * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1907  * @tset: taskset of interest
1908  *
1909  * Return the cgroup for the current (last returned) task of @tset. This
1910  * function must be preceded by either cgroup_taskset_first() or
1911  * cgroup_taskset_next().
1912  */
1913 struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1914 {
1915  return tset->cur_cgrp;
1916 }
1917 EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1918 
1919 /**
1920  * cgroup_taskset_size - return the number of tasks in taskset
1921  * @tset: taskset of interest
1922  */
1923 int cgroup_taskset_size(struct cgroup_taskset *tset)
1924 {
1925  return tset->tc_array ? tset->tc_array_len : 1;
1926 }
1927 EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1928 
1929 
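/*
 * Illustrative sketch -- not part of cgroup.c.  A controller's
 * ->can_attach() callback walks the taskset with the iterators above
 * ("my_subsys_can_attach" and "task_is_acceptable" are hypothetical names):
 */
#if 0
static int my_subsys_can_attach(struct cgroup *cgrp,
				struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset))
		if (!task_is_acceptable(task))
			return -EINVAL;
	return 0;
}
#endif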
1930 /*
1931  * cgroup_task_migrate - move a task from one cgroup to another.
1932  *
1933  * The new css_set @newcg must have been obtained beforehand (e.g. via
1934  * find_css_set()), so this function itself never sleeps and cannot fail.
1935  * Must be called with cgroup_mutex and threadgroup locked.
1936  */
1937 static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938  struct task_struct *tsk, struct css_set *newcg)
1939 {
1940  struct css_set *oldcg;
1941 
1942  /*
1943  * We are synchronized through threadgroup_lock() against PF_EXITING
1944  * setting such that we can't race against cgroup_exit() changing the
1945  * css_set to init_css_set and dropping the old one.
1946  */
1947  WARN_ON_ONCE(tsk->flags & PF_EXITING);
1948  oldcg = tsk->cgroups;
1949 
1950  task_lock(tsk);
1951  rcu_assign_pointer(tsk->cgroups, newcg);
1952  task_unlock(tsk);
1953 
1954  /* Update the css_set linked lists if we're using them */
1955  write_lock(&css_set_lock);
1956  if (!list_empty(&tsk->cg_list))
1957  list_move(&tsk->cg_list, &newcg->tasks);
1958  write_unlock(&css_set_lock);
1959 
1960  /*
1961  * We just gained a reference on oldcg by taking it from the task. As
1962  * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1963  * it here; it will be freed under RCU.
1964  */
1965  set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1966  put_css_set(oldcg);
1967 }
1968 
1977 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1978 {
1979  int retval = 0;
1980  struct cgroup_subsys *ss, *failed_ss = NULL;
1981  struct cgroup *oldcgrp;
1982  struct cgroupfs_root *root = cgrp->root;
1983  struct cgroup_taskset tset = { };
1984  struct css_set *newcg;
1985 
1986  /* @tsk either already exited or can't exit until the end */
1987  if (tsk->flags & PF_EXITING)
1988  return -ESRCH;
1989 
1990  /* Nothing to do if the task is already in that cgroup */
1991  oldcgrp = task_cgroup_from_root(tsk, root);
1992  if (cgrp == oldcgrp)
1993  return 0;
1994 
1995  tset.single.task = tsk;
1996  tset.single.cgrp = oldcgrp;
1997 
1998  for_each_subsys(root, ss) {
1999  if (ss->can_attach) {
2000  retval = ss->can_attach(cgrp, &tset);
2001  if (retval) {
2002  /*
2003  * Remember on which subsystem the can_attach()
2004  * failed, so that we only call cancel_attach()
2005  * against the subsystems whose can_attach()
2006  * succeeded. (See below)
2007  */
2008  failed_ss = ss;
2009  goto out;
2010  }
2011  }
2012  }
2013 
2014  newcg = find_css_set(tsk->cgroups, cgrp);
2015  if (!newcg) {
2016  retval = -ENOMEM;
2017  goto out;
2018  }
2019 
2020  cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
2021 
2022  for_each_subsys(root, ss) {
2023  if (ss->attach)
2024  ss->attach(cgrp, &tset);
2025  }
2026 
2027  synchronize_rcu();
2028 
2029  /*
2030  * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031  * is no longer empty.
2032  */
2033  cgroup_wakeup_rmdir_waiter(cgrp);
2034 out:
2035  if (retval) {
2036  for_each_subsys(root, ss) {
2037  if (ss == failed_ss)
2038  /*
2039  * This subsystem was the one that failed the
2040  * can_attach() check earlier, so we don't need
2041  * to call cancel_attach() against it or any
2042  * remaining subsystems.
2043  */
2044  break;
2045  if (ss->cancel_attach)
2046  ss->cancel_attach(cgrp, &tset);
2047  }
2048  }
2049  return retval;
2050 }
2051 
2057 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2058 {
2059  struct cgroupfs_root *root;
2060  int retval = 0;
2061 
2062  cgroup_lock();
2063  for_each_active_root(root) {
2064  struct cgroup *from_cg = task_cgroup_from_root(from, root);
2065 
2066  retval = cgroup_attach_task(from_cg, tsk);
2067  if (retval)
2068  break;
2069  }
2070  cgroup_unlock();
2071 
2072  return retval;
2073 }
2075 
2084 static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2085 {
2086  int retval, i, group_size;
2087  struct cgroup_subsys *ss, *failed_ss = NULL;
2088  /* guaranteed to be initialized later, but the compiler needs this */
2089  struct cgroupfs_root *root = cgrp->root;
2090  /* threadgroup list cursor and array */
2091  struct task_struct *tsk;
2092  struct task_and_cgroup *tc;
2093  struct flex_array *group;
2094  struct cgroup_taskset tset = { };
2095 
2096  /*
2097  * step 0: in order to do expensive, possibly blocking operations for
2098  * every thread, we cannot iterate the thread group list, since it needs
2099  * rcu or tasklist locked. instead, build an array of all threads in the
2100  * group - group_rwsem prevents new threads from appearing, and if
2101  * threads exit, this will just be an over-estimate.
2102  */
2103  group_size = get_nr_threads(leader);
2104  /* flex_array supports very large thread-groups better than kmalloc. */
2105  group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2106  if (!group)
2107  return -ENOMEM;
2108  /* pre-allocate to guarantee space while iterating in rcu read-side. */
2109  retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2110  if (retval)
2111  goto out_free_group_list;
2112 
2113  tsk = leader;
2114  i = 0;
2115  /*
2116  * Prevent freeing of tasks while we take a snapshot. Tasks that are
2117  * already PF_EXITING could be freed from underneath us unless we
2118  * take an rcu_read_lock.
2119  */
2120  rcu_read_lock();
2121  do {
2122  struct task_and_cgroup ent;
2123 
2124  /* @tsk either already exited or can't exit until the end */
2125  if (tsk->flags & PF_EXITING)
2126  continue;
2127 
2128  /* as per above, nr_threads may decrease, but not increase. */
2129  BUG_ON(i >= group_size);
2130  ent.task = tsk;
2131  ent.cgrp = task_cgroup_from_root(tsk, root);
2132  /* nothing to do if this task is already in the cgroup */
2133  if (ent.cgrp == cgrp)
2134  continue;
2135  /*
2136  * saying GFP_ATOMIC has no effect here because we did prealloc
2137  * earlier, but it's good form to communicate our expectations.
2138  */
2139  retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2140  BUG_ON(retval != 0);
2141  i++;
2142  } while_each_thread(leader, tsk);
2143  rcu_read_unlock();
2144  /* remember the number of threads in the array for later. */
2145  group_size = i;
2146  tset.tc_array = group;
2147  tset.tc_array_len = group_size;
2148 
2149  /* methods shouldn't be called if no task is actually migrating */
2150  retval = 0;
2151  if (!group_size)
2152  goto out_free_group_list;
2153 
2154  /*
2155  * step 1: check that we can legitimately attach to the cgroup.
2156  */
2157  for_each_subsys(root, ss) {
2158  if (ss->can_attach) {
2159  retval = ss->can_attach(cgrp, &tset);
2160  if (retval) {
2161  failed_ss = ss;
2162  goto out_cancel_attach;
2163  }
2164  }
2165  }
2166 
2167  /*
2168  * step 2: make sure css_sets exist for all threads to be migrated.
2169  * we use find_css_set, which allocates a new one if necessary.
2170  */
2171  for (i = 0; i < group_size; i++) {
2172  tc = flex_array_get(group, i);
2173  tc->cg = find_css_set(tc->task->cgroups, cgrp);
2174  if (!tc->cg) {
2175  retval = -ENOMEM;
2176  goto out_put_css_set_refs;
2177  }
2178  }
2179 
2180  /*
2181  * step 3: now that we're guaranteed success wrt the css_sets,
2182  * proceed to move all tasks to the new cgroup. There are no
2183  * failure cases after here, so this is the commit point.
2184  */
2185  for (i = 0; i < group_size; i++) {
2186  tc = flex_array_get(group, i);
2187  cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
2188  }
2189  /* nothing is sensitive to fork() after this point. */
2190 
2191  /*
2192  * step 4: do subsystem attach callbacks.
2193  */
2194  for_each_subsys(root, ss) {
2195  if (ss->attach)
2196  ss->attach(cgrp, &tset);
2197  }
2198 
2199  /*
2200  * step 5: success! and cleanup
2201  */
2202  synchronize_rcu();
2203  cgroup_wakeup_rmdir_waiter(cgrp);
2204  retval = 0;
2205 out_put_css_set_refs:
2206  if (retval) {
2207  for (i = 0; i < group_size; i++) {
2208  tc = flex_array_get(group, i);
2209  if (!tc->cg)
2210  break;
2211  put_css_set(tc->cg);
2212  }
2213  }
2214 out_cancel_attach:
2215  if (retval) {
2216  for_each_subsys(root, ss) {
2217  if (ss == failed_ss)
2218  break;
2219  if (ss->cancel_attach)
2220  ss->cancel_attach(cgrp, &tset);
2221  }
2222  }
2223 out_free_group_list:
2224  flex_array_free(group);
2225  return retval;
2226 }
2227 
2228 /*
2229  * Find the task_struct of the task to attach by vpid and pass it along to the
2230  * function to attach either it or all tasks in its threadgroup. Will lock
2231  * cgroup_mutex and threadgroup; may take task_lock of task.
2232  */
2233 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2234 {
2235  struct task_struct *tsk;
2236  const struct cred *cred = current_cred(), *tcred;
2237  int ret;
2238 
2239  if (!cgroup_lock_live_group(cgrp))
2240  return -ENODEV;
2241 
2242 retry_find_task:
2243  rcu_read_lock();
2244  if (pid) {
2245  tsk = find_task_by_vpid(pid);
2246  if (!tsk) {
2247  rcu_read_unlock();
2248  ret = -ESRCH;
2249  goto out_unlock_cgroup;
2250  }
2251  /*
2252  * even if we're attaching all tasks in the thread group, we
2253  * only need to check permissions on one of them.
2254  */
2255  tcred = __task_cred(tsk);
2256  if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2257  !uid_eq(cred->euid, tcred->uid) &&
2258  !uid_eq(cred->euid, tcred->suid)) {
2259  rcu_read_unlock();
2260  ret = -EACCES;
2261  goto out_unlock_cgroup;
2262  }
2263  } else
2264  tsk = current;
2265 
2266  if (threadgroup)
2267  tsk = tsk->group_leader;
2268 
2269  /*
2270  * Workqueue threads may acquire PF_THREAD_BOUND and become
2271  * trapped in a cpuset, or RT worker may be born in a cgroup
2272  * with no rt_runtime allocated. Just say no.
2273  */
2274  if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2275  ret = -EINVAL;
2276  rcu_read_unlock();
2277  goto out_unlock_cgroup;
2278  }
2279 
2280  get_task_struct(tsk);
2281  rcu_read_unlock();
2282 
2283  threadgroup_lock(tsk);
2284  if (threadgroup) {
2285  if (!thread_group_leader(tsk)) {
2286  /*
2287  * a race with de_thread from another thread's exec()
2288  * may strip us of our leadership, if this happens,
2289  * there is no choice but to throw this task away and
2290  * try again; this is
2291  * "double-double-toil-and-trouble-check locking".
2292  */
2293  threadgroup_unlock(tsk);
2294  put_task_struct(tsk);
2295  goto retry_find_task;
2296  }
2297  ret = cgroup_attach_proc(cgrp, tsk);
2298  } else
2299  ret = cgroup_attach_task(cgrp, tsk);
2300  threadgroup_unlock(tsk);
2301 
2302  put_task_struct(tsk);
2303 out_unlock_cgroup:
2304  cgroup_unlock();
2305  return ret;
2306 }
2307 
2308 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2309 {
2310  return attach_task_by_pid(cgrp, pid, false);
2311 }
2312 
2313 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2314 {
2315  return attach_task_by_pid(cgrp, tgid, true);
2316 }
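/*
 * Illustrative sketch of the userspace side these handlers serve
 * (hypothetical standalone program, not kernel code): writing a TGID to
 * "cgroup.procs" migrates the whole thread group via cgroup_procs_write(),
 * writing a TID to "tasks" migrates a single thread via
 * cgroup_tasks_write(). The mount point used below is an assumption.
 */
#include <stdio.h>
#include <sys/types.h>

static int move_process(const char *cgrp_dir, pid_t tgid)
{
	char path[4096];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.procs", cgrp_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", (int)tgid);
	return fclose(f);	/* 0 on success, EOF on write-back failure */
}

/* e.g. move_process("/sys/fs/cgroup/cpu/mygroup", getpid()); */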
2317 
2325 bool cgroup_lock_live_group(struct cgroup *cgrp)
2326 {
2327  mutex_lock(&cgroup_mutex);
2328  if (cgroup_is_removed(cgrp)) {
2329  mutex_unlock(&cgroup_mutex);
2330  return false;
2331  }
2332  return true;
2333 }
2335 
2336 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2337  const char *buffer)
2338 {
2339  BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2340  if (strlen(buffer) >= PATH_MAX)
2341  return -EINVAL;
2342  if (!cgroup_lock_live_group(cgrp))
2343  return -ENODEV;
2344  mutex_lock(&cgroup_root_mutex);
2345  strcpy(cgrp->root->release_agent_path, buffer);
2346  mutex_unlock(&cgroup_root_mutex);
2347  cgroup_unlock();
2348  return 0;
2349 }
2350 
2351 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2352  struct seq_file *seq)
2353 {
2354  if (!cgroup_lock_live_group(cgrp))
2355  return -ENODEV;
2356  seq_puts(seq, cgrp->root->release_agent_path);
2357  seq_putc(seq, '\n');
2358  cgroup_unlock();
2359  return 0;
2360 }
2361 
2362 /* A buffer size big enough for numbers or short strings */
2363 #define CGROUP_LOCAL_BUFFER_SIZE 64
2364 
2365 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2366  struct file *file,
2367  const char __user *userbuf,
2368  size_t nbytes, loff_t *unused_ppos)
2369 {
2370  char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2371  int retval = 0;
2372  char *end;
2373 
2374  if (!nbytes)
2375  return -EINVAL;
2376  if (nbytes >= sizeof(buffer))
2377  return -E2BIG;
2378  if (copy_from_user(buffer, userbuf, nbytes))
2379  return -EFAULT;
2380 
2381  buffer[nbytes] = 0; /* nul-terminate */
2382  if (cft->write_u64) {
2383  u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2384  if (*end)
2385  return -EINVAL;
2386  retval = cft->write_u64(cgrp, cft, val);
2387  } else {
2388  s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2389  if (*end)
2390  return -EINVAL;
2391  retval = cft->write_s64(cgrp, cft, val);
2392  }
2393  if (!retval)
2394  retval = nbytes;
2395  return retval;
2396 }
2397 
2398 static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2399  struct file *file,
2400  const char __user *userbuf,
2401  size_t nbytes, loff_t *unused_ppos)
2402 {
2403  char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2404  int retval = 0;
2405  size_t max_bytes = cft->max_write_len;
2406  char *buffer = local_buffer;
2407 
2408  if (!max_bytes)
2409  max_bytes = sizeof(local_buffer) - 1;
2410  if (nbytes >= max_bytes)
2411  return -E2BIG;
2412  /* Allocate a dynamic buffer if we need one */
2413  if (nbytes >= sizeof(local_buffer)) {
2414  buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2415  if (buffer == NULL)
2416  return -ENOMEM;
2417  }
2418  if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2419  retval = -EFAULT;
2420  goto out;
2421  }
2422 
2423  buffer[nbytes] = 0; /* nul-terminate */
2424  retval = cft->write_string(cgrp, cft, strstrip(buffer));
2425  if (!retval)
2426  retval = nbytes;
2427 out:
2428  if (buffer != local_buffer)
2429  kfree(buffer);
2430  return retval;
2431 }
2432 
2433 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2434  size_t nbytes, loff_t *ppos)
2435 {
2436  struct cftype *cft = __d_cft(file->f_dentry);
2437  struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2438 
2439  if (cgroup_is_removed(cgrp))
2440  return -ENODEV;
2441  if (cft->write)
2442  return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2443  if (cft->write_u64 || cft->write_s64)
2444  return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2445  if (cft->write_string)
2446  return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2447  if (cft->trigger) {
2448  int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2449  return ret ? ret : nbytes;
2450  }
2451  return -EINVAL;
2452 }
2453 
2454 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2455  struct file *file,
2456  char __user *buf, size_t nbytes,
2457  loff_t *ppos)
2458 {
2459  char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2460  u64 val = cft->read_u64(cgrp, cft);
2461  int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2462 
2463  return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2464 }
2465 
2466 static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2467  struct file *file,
2468  char __user *buf, size_t nbytes,
2469  loff_t *ppos)
2470 {
2471  char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2472  s64 val = cft->read_s64(cgrp, cft);
2473  int len = sprintf(tmp, "%lld\n", (long long) val);
2474 
2475  return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2476 }
2477 
2478 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2479  size_t nbytes, loff_t *ppos)
2480 {
2481  struct cftype *cft = __d_cft(file->f_dentry);
2482  struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2483 
2484  if (cgroup_is_removed(cgrp))
2485  return -ENODEV;
2486 
2487  if (cft->read)
2488  return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2489  if (cft->read_u64)
2490  return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2491  if (cft->read_s64)
2492  return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2493  return -EINVAL;
2494 }
2495 
2496 /*
2497  * seqfile ops/methods for returning structured data. Currently just
2498  * supports string->u64 maps, but can be extended in future.
2499  */
2500 
2501 struct cgroup_seqfile_state {
2502  struct cftype *cft;
2503  struct cgroup *cgroup;
2504 };
2505 
2506 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2507 {
2508  struct seq_file *sf = cb->state;
2509  return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2510 }
2511 
2512 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2513 {
2514  struct cgroup_seqfile_state *state = m->private;
2515  struct cftype *cft = state->cft;
2516  if (cft->read_map) {
2517  struct cgroup_map_cb cb = {
2518  .fill = cgroup_map_add,
2519  .state = m,
2520  };
2521  return cft->read_map(state->cgroup, cft, &cb);
2522  }
2523  return cft->read_seq_string(state->cgroup, cft, m);
2524 }
2525 
2526 static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2527 {
2528  struct seq_file *seq = file->private_data;
2529  kfree(seq->private);
2530  return single_release(inode, file);
2531 }
2532 
2533 static const struct file_operations cgroup_seqfile_operations = {
2534  .read = seq_read,
2535  .write = cgroup_file_write,
2536  .llseek = seq_lseek,
2537  .release = cgroup_seqfile_release,
2538 };
2539 
2540 static int cgroup_file_open(struct inode *inode, struct file *file)
2541 {
2542  int err;
2543  struct cftype *cft;
2544 
2545  err = generic_file_open(inode, file);
2546  if (err)
2547  return err;
2548  cft = __d_cft(file->f_dentry);
2549 
2550  if (cft->read_map || cft->read_seq_string) {
2551  struct cgroup_seqfile_state *state =
2552  kzalloc(sizeof(*state), GFP_USER);
2553  if (!state)
2554  return -ENOMEM;
2555  state->cft = cft;
2556  state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2557  file->f_op = &cgroup_seqfile_operations;
2558  err = single_open(file, cgroup_seqfile_show, state);
2559  if (err < 0)
2560  kfree(state);
2561  } else if (cft->open)
2562  err = cft->open(inode, file);
2563  else
2564  err = 0;
2565 
2566  return err;
2567 }
2568 
2569 static int cgroup_file_release(struct inode *inode, struct file *file)
2570 {
2571  struct cftype *cft = __d_cft(file->f_dentry);
2572  if (cft->release)
2573  return cft->release(inode, file);
2574  return 0;
2575 }
2576 
2577 /*
2578  * cgroup_rename - Only allow simple rename of directories in place.
2579  */
2580 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2581  struct inode *new_dir, struct dentry *new_dentry)
2582 {
2583  if (!S_ISDIR(old_dentry->d_inode->i_mode))
2584  return -ENOTDIR;
2585  if (new_dentry->d_inode)
2586  return -EEXIST;
2587  if (old_dir != new_dir)
2588  return -EIO;
2589  return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2590 }
2591 
2592 static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2593 {
2594  if (S_ISDIR(dentry->d_inode->i_mode))
2595  return &__d_cgrp(dentry)->xattrs;
2596  else
2597  return &__d_cft(dentry)->xattrs;
2598 }
2599 
2600 static inline int xattr_enabled(struct dentry *dentry)
2601 {
2602  struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2603  return test_bit(ROOT_XATTR, &root->flags);
2604 }
2605 
2606 static bool is_valid_xattr(const char *name)
2607 {
2608  if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2609  !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2610  return true;
2611  return false;
2612 }
2613 
2614 static int cgroup_setxattr(struct dentry *dentry, const char *name,
2615  const void *val, size_t size, int flags)
2616 {
2617  if (!xattr_enabled(dentry))
2618  return -EOPNOTSUPP;
2619  if (!is_valid_xattr(name))
2620  return -EINVAL;
2621  return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2622 }
2623 
2624 static int cgroup_removexattr(struct dentry *dentry, const char *name)
2625 {
2626  if (!xattr_enabled(dentry))
2627  return -EOPNOTSUPP;
2628  if (!is_valid_xattr(name))
2629  return -EINVAL;
2630  return simple_xattr_remove(__d_xattrs(dentry), name);
2631 }
2632 
2633 static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2634  void *buf, size_t size)
2635 {
2636  if (!xattr_enabled(dentry))
2637  return -EOPNOTSUPP;
2638  if (!is_valid_xattr(name))
2639  return -EINVAL;
2640  return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2641 }
2642 
2643 static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2644 {
2645  if (!xattr_enabled(dentry))
2646  return -EOPNOTSUPP;
2647  return simple_xattr_list(__d_xattrs(dentry), buf, size);
2648 }
2649 
2650 static const struct file_operations cgroup_file_operations = {
2651  .read = cgroup_file_read,
2652  .write = cgroup_file_write,
2653  .llseek = generic_file_llseek,
2654  .open = cgroup_file_open,
2655  .release = cgroup_file_release,
2656 };
2657 
2658 static const struct inode_operations cgroup_file_inode_operations = {
2659  .setxattr = cgroup_setxattr,
2660  .getxattr = cgroup_getxattr,
2661  .listxattr = cgroup_listxattr,
2662  .removexattr = cgroup_removexattr,
2663 };
2664 
2665 static const struct inode_operations cgroup_dir_inode_operations = {
2666  .lookup = cgroup_lookup,
2667  .mkdir = cgroup_mkdir,
2668  .rmdir = cgroup_rmdir,
2669  .rename = cgroup_rename,
2670  .setxattr = cgroup_setxattr,
2671  .getxattr = cgroup_getxattr,
2672  .listxattr = cgroup_listxattr,
2673  .removexattr = cgroup_removexattr,
2674 };
2675 
2676 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2677 {
2678  if (dentry->d_name.len > NAME_MAX)
2679  return ERR_PTR(-ENAMETOOLONG);
2680  d_add(dentry, NULL);
2681  return NULL;
2682 }
2683 
2684 /*
2685  * Check if a file is a control file
2686  */
2687 static inline struct cftype *__file_cft(struct file *file)
2688 {
2689  if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2690  return ERR_PTR(-EINVAL);
2691  return __d_cft(file->f_dentry);
2692 }
2693 
2694 static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2695  struct super_block *sb)
2696 {
2697  struct inode *inode;
2698 
2699  if (!dentry)
2700  return -ENOENT;
2701  if (dentry->d_inode)
2702  return -EEXIST;
2703 
2704  inode = cgroup_new_inode(mode, sb);
2705  if (!inode)
2706  return -ENOMEM;
2707 
2708  if (S_ISDIR(mode)) {
2709  inode->i_op = &cgroup_dir_inode_operations;
2710  inode->i_fop = &simple_dir_operations;
2711 
2712  /* start off with i_nlink == 2 (for "." entry) */
2713  inc_nlink(inode);
2714 
2715  /* start with the directory inode held, so that we can
2716  * populate it without racing with another mkdir */
2717  mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2718  } else if (S_ISREG(mode)) {
2719  inode->i_size = 0;
2720  inode->i_fop = &cgroup_file_operations;
2721  inode->i_op = &cgroup_file_inode_operations;
2722  }
2723  d_instantiate(dentry, inode);
2724  dget(dentry); /* Extra count - pin the dentry in core */
2725  return 0;
2726 }
2727 
2728 /*
2729  * cgroup_create_dir - create a directory for an object.
2730  * @cgrp: the cgroup we create the directory for. It must have a valid
2731  * ->parent field. And we are going to fill its ->dentry field.
2732  * @dentry: dentry of the new cgroup
2733  * @mode: mode to set on new directory.
2734  */
2735 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736  umode_t mode)
2737 {
2738  struct dentry *parent;
2739  int error = 0;
2740 
2741  parent = cgrp->parent->dentry;
2742  error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743  if (!error) {
2744  dentry->d_fsdata = cgrp;
2745  inc_nlink(parent->d_inode);
2746  rcu_assign_pointer(cgrp->dentry, dentry);
2747  dget(dentry);
2748  }
2749  dput(dentry);
2750 
2751  return error;
2752 }
2753 
2763 static umode_t cgroup_file_mode(const struct cftype *cft)
2764 {
2765  umode_t mode = 0;
2766 
2767  if (cft->mode)
2768  return cft->mode;
2769 
2770  if (cft->read || cft->read_u64 || cft->read_s64 ||
2771  cft->read_map || cft->read_seq_string)
2772  mode |= S_IRUGO;
2773 
2774  if (cft->write || cft->write_u64 || cft->write_s64 ||
2775  cft->write_string || cft->trigger)
2776  mode |= S_IWUSR;
2777 
2778  return mode;
2779 }
2780 
2781 static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2782  struct cftype *cft)
2783 {
2784  struct dentry *dir = cgrp->dentry;
2785  struct cgroup *parent = __d_cgrp(dir);
2786  struct dentry *dentry;
2787  struct cfent *cfe;
2788  int error;
2789  umode_t mode;
2790  char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2791 
2792  simple_xattrs_init(&cft->xattrs);
2793 
2794  /* does @cft->flags tell us to skip creation on @cgrp? */
2795  if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796  return 0;
2797  if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798  return 0;
2799 
2800  if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801  strcpy(name, subsys->name);
2802  strcat(name, ".");
2803  }
2804  strcat(name, cft->name);
2805 
2806  BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2807 
2808  cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2809  if (!cfe)
2810  return -ENOMEM;
2811 
2812  dentry = lookup_one_len(name, dir, strlen(name));
2813  if (IS_ERR(dentry)) {
2814  error = PTR_ERR(dentry);
2815  goto out;
2816  }
2817 
2818  mode = cgroup_file_mode(cft);
2819  error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2820  if (!error) {
2821  cfe->type = (void *)cft;
2822  cfe->dentry = dentry;
2823  dentry->d_fsdata = cfe;
2824  list_add_tail(&cfe->node, &parent->files);
2825  cfe = NULL;
2826  }
2827  dput(dentry);
2828 out:
2829  kfree(cfe);
2830  return error;
2831 }
2832 
2833 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2834  struct cftype cfts[], bool is_add)
2835 {
2836  struct cftype *cft;
2837  int err, ret = 0;
2838 
2839  for (cft = cfts; cft->name[0] != '\0'; cft++) {
2840  if (is_add)
2841  err = cgroup_add_file(cgrp, subsys, cft);
2842  else
2843  err = cgroup_rm_file(cgrp, cft);
2844  if (err) {
2845  pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2846  is_add ? "add" : "remove", cft->name, err);
2847  ret = err;
2848  }
2849  }
2850  return ret;
2851 }
2852 
2853 static DEFINE_MUTEX(cgroup_cft_mutex);
2854 
2855 static void cgroup_cfts_prepare(void)
2856  __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2857 {
2858  /*
2859  * Thanks to the entanglement with vfs inode locking, we can't walk
2860  * the existing cgroups under cgroup_mutex and create files.
2861  * Instead, we increment reference on all cgroups and build list of
2862  * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2863  * exclusive access to the field.
2864  */
2865  mutex_lock(&cgroup_cft_mutex);
2866  mutex_lock(&cgroup_mutex);
2867 }
2868 
2869 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2870  struct cftype *cfts, bool is_add)
2871  __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2872 {
2873  LIST_HEAD(pending);
2874  struct cgroup *cgrp, *n;
2875 
2876  /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2877  if (cfts && ss->root != &rootnode) {
2878  list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2879  dget(cgrp->dentry);
2880  list_add_tail(&cgrp->cft_q_node, &pending);
2881  }
2882  }
2883 
2884  mutex_unlock(&cgroup_mutex);
2885 
2886  /*
2887  * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2888  * files for all cgroups which were created before.
2889  */
2890  list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2891  struct inode *inode = cgrp->dentry->d_inode;
2892 
2893  mutex_lock(&inode->i_mutex);
2894  mutex_lock(&cgroup_mutex);
2895  if (!cgroup_is_removed(cgrp))
2896  cgroup_addrm_files(cgrp, ss, cfts, is_add);
2897  mutex_unlock(&cgroup_mutex);
2898  mutex_unlock(&inode->i_mutex);
2899 
2900  list_del_init(&cgrp->cft_q_node);
2901  dput(cgrp->dentry);
2902  }
2903 
2904  mutex_unlock(&cgroup_cft_mutex);
2905 }
2906 
2921 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2922 {
2923  struct cftype_set *set;
2924 
2925  set = kzalloc(sizeof(*set), GFP_KERNEL);
2926  if (!set)
2927  return -ENOMEM;
2928 
2929  cgroup_cfts_prepare();
2930  set->cfts = cfts;
2931  list_add_tail(&set->node, &ss->cftsets);
2932  cgroup_cfts_commit(ss, cfts, true);
2933 
2934  return 0;
2935 }
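/*
 * Illustrative sketch, not part of cgroup.c: a subsystem declares its
 * control files as a cftype array terminated by an empty entry and
 * registers it once through cgroup_add_cftypes(); cgroup_file_write() and
 * cgroup_file_read() above then dispatch to the handlers. "example_read",
 * "example_write" and the file name are hypothetical.
 */
static u64 example_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;
}

static int example_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	return 0;
}

static struct cftype example_cfts[] = {
	{
		.name = "example_value",
		.read_u64 = example_read,
		.write_u64 = example_write,
	},
	{ }	/* terminate */
};

static int __init example_cfts_init(struct cgroup_subsys *ss)
{
	return cgroup_add_cftypes(ss, example_cfts);
}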
2937 
2951 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2952 {
2953  struct cftype_set *set;
2954 
2955  cgroup_cfts_prepare();
2956 
2957  list_for_each_entry(set, &ss->cftsets, node) {
2958  if (set->cfts == cfts) {
2959  list_del_init(&set->node);
2960  cgroup_cfts_commit(ss, cfts, false);
2961  return 0;
2962  }
2963  }
2964 
2965  cgroup_cfts_commit(ss, NULL, false);
2966  return -ENOENT;
2967 }
2968 
2975 int cgroup_task_count(const struct cgroup *cgrp)
2976 {
2977  int count = 0;
2978  struct cg_cgroup_link *link;
2979 
2980  read_lock(&css_set_lock);
2981  list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2982  count += atomic_read(&link->cg->refcount);
2983  }
2984  read_unlock(&css_set_lock);
2985  return count;
2986 }
2987 
2988 /*
2989  * Advance a list_head iterator. The iterator should be positioned at
2990  * the start of a css_set
2991  */
2992 static void cgroup_advance_iter(struct cgroup *cgrp,
2993  struct cgroup_iter *it)
2994 {
2995  struct list_head *l = it->cg_link;
2996  struct cg_cgroup_link *link;
2997  struct css_set *cg;
2998 
2999  /* Advance to the next non-empty css_set */
3000  do {
3001  l = l->next;
3002  if (l == &cgrp->css_sets) {
3003  it->cg_link = NULL;
3004  return;
3005  }
3006  link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
3007  cg = link->cg;
3008  } while (list_empty(&cg->tasks));
3009  it->cg_link = l;
3010  it->task = cg->tasks.next;
3011 }
3012 
3013 /*
3014  * To reduce the fork() overhead for systems that are not actually
3015  * using their cgroups capability, we don't maintain the lists running
3016  * through each css_set to its tasks until we see the list actually
3017  * used - in other words after the first call to cgroup_iter_start().
3018  */
3019 static void cgroup_enable_task_cg_lists(void)
3020 {
3021  struct task_struct *p, *g;
3022  write_lock(&css_set_lock);
3023  use_task_css_set_links = 1;
3024  /*
3025  * We need tasklist_lock because RCU is not safe against
3026  * while_each_thread(). Besides, a forking task that has passed
3027  * cgroup_post_fork() without seeing use_task_css_set_links = 1
3028  * is not guaranteed to have its child immediately visible in the
3029  * tasklist if we walk through it with RCU.
3030  */
3031  read_lock(&tasklist_lock);
3032  do_each_thread(g, p) {
3033  task_lock(p);
3034  /*
3035  * We should check if the process is exiting, otherwise
3036  * it will race with cgroup_exit() in that the list
3037  * entry won't be deleted though the process has exited.
3038  */
3039  if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
3040  list_add(&p->cg_list, &p->cgroups->tasks);
3041  task_unlock(p);
3042  } while_each_thread(g, p);
3043  read_unlock(&tasklist_lock);
3044  write_unlock(&css_set_lock);
3045 }
3046 
3047 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048  __acquires(css_set_lock)
3049 {
3050  /*
3051  * The first time anyone tries to iterate across a cgroup,
3052  * we need to enable the list linking each css_set to its
3053  * tasks, and fix up all existing tasks.
3054  */
3055  if (!use_task_css_set_links)
3056  cgroup_enable_task_cg_lists();
3057 
3058  read_lock(&css_set_lock);
3059  it->cg_link = &cgrp->css_sets;
3060  cgroup_advance_iter(cgrp, it);
3061 }
3062 
3063 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3064  struct cgroup_iter *it)
3065 {
3066  struct task_struct *res;
3067  struct list_head *l = it->task;
3068  struct cg_cgroup_link *link;
3069 
3070  /* If the iterator cg is NULL, we have no tasks */
3071  if (!it->cg_link)
3072  return NULL;
3073  res = list_entry(l, struct task_struct, cg_list);
3074  /* Advance iterator to find next entry */
3075  l = l->next;
3076  link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
3077  if (l == &link->cg->tasks) {
3078  /* We reached the end of this task list - move on to
3079  * the next cg_cgroup_link */
3080  cgroup_advance_iter(cgrp, it);
3081  } else {
3082  it->task = l;
3083  }
3084  return res;
3085 }
3086 
3087 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
3088  __releases(css_set_lock)
3089 {
3090  read_unlock(&css_set_lock);
3091 }
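/*
 * Illustrative sketch, not part of cgroup.c: the canonical
 * cgroup_iter_start()/next()/end() pattern, as used below by
 * pidlist_array_load() and cgroupstats_build(). Nothing in the loop may
 * sleep, since css_set_lock is read-held for the whole walk.
 * "example_count_tasks" is hypothetical.
 */
static int example_count_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int n = 0;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it)))
		n++;
	cgroup_iter_end(cgrp, &it);
	return n;
}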
3092 
3093 static inline int started_after_time(struct task_struct *t1,
3094  struct timespec *time,
3095  struct task_struct *t2)
3096 {
3097  int start_diff = timespec_compare(&t1->start_time, time);
3098  if (start_diff > 0) {
3099  return 1;
3100  } else if (start_diff < 0) {
3101  return 0;
3102  } else {
3103  /*
3104  * Arbitrarily, if two processes started at the same
3105  * time, we'll say that the lower pointer value
3106  * started first. Note that t2 may have exited by now
3107  * so this may not be a valid pointer any longer, but
3108  * that's fine - it still serves to distinguish
3109  * between two tasks started (effectively) simultaneously.
3110  */
3111  return t1 > t2;
3112  }
3113 }
3114 
3115 /*
3116  * This function is a callback from heap_insert() and is used to order
3117  * the heap.
3118  * In this case we order the heap in descending task start time.
3119  */
3120 static inline int started_after(void *p1, void *p2)
3121 {
3122  struct task_struct *t1 = p1;
3123  struct task_struct *t2 = p2;
3124  return started_after_time(t1, &t2->start_time, t2);
3125 }
3126 
3154 int cgroup_scan_tasks(struct cgroup_scanner *scan)
3155 {
3156  int retval, i;
3157  struct cgroup_iter it;
3158  struct task_struct *p, *dropped;
3159  /* Never dereference latest_task, since it's not refcounted */
3160  struct task_struct *latest_task = NULL;
3161  struct ptr_heap tmp_heap;
3162  struct ptr_heap *heap;
3163  struct timespec latest_time = { 0, 0 };
3164 
3165  if (scan->heap) {
3166  /* The caller supplied our heap and pre-allocated its memory */
3167  heap = scan->heap;
3168  heap->gt = &started_after;
3169  } else {
3170  /* We need to allocate our own heap memory */
3171  heap = &tmp_heap;
3172  retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3173  if (retval)
3174  /* cannot allocate the heap */
3175  return retval;
3176  }
3177 
3178  again:
3179  /*
3180  * Scan tasks in the cgroup, using the scanner's "test_task" callback
3181  * to determine which are of interest, and using the scanner's
3182  * "process_task" callback to process any of them that need an update.
3183  * Since we don't want to hold any locks during the task updates,
3184  * gather tasks to be processed in a heap structure.
3185  * The heap is sorted by descending task start time.
3186  * If the statically-sized heap fills up, we overflow tasks that
3187  * started later, and in future iterations only consider tasks that
3188  * started after the latest task in the previous pass. This
3189  * guarantees forward progress and that we don't miss any tasks.
3190  */
3191  heap->size = 0;
3192  cgroup_iter_start(scan->cg, &it);
3193  while ((p = cgroup_iter_next(scan->cg, &it))) {
3194  /*
3195  * Only affect tasks that qualify per the caller's callback,
3196  * if he provided one
3197  */
3198  if (scan->test_task && !scan->test_task(p, scan))
3199  continue;
3200  /*
3201  * Only process tasks that started after the last task
3202  * we processed
3203  */
3204  if (!started_after_time(p, &latest_time, latest_task))
3205  continue;
3206  dropped = heap_insert(heap, p);
3207  if (dropped == NULL) {
3208  /*
3209  * The new task was inserted; the heap wasn't
3210  * previously full
3211  */
3212  get_task_struct(p);
3213  } else if (dropped != p) {
3214  /*
3215  * The new task was inserted, and pushed out a
3216  * different task
3217  */
3218  get_task_struct(p);
3219  put_task_struct(dropped);
3220  }
3221  /*
3222  * Else the new task was newer than anything already in
3223  * the heap and wasn't inserted
3224  */
3225  }
3226  cgroup_iter_end(scan->cg, &it);
3227 
3228  if (heap->size) {
3229  for (i = 0; i < heap->size; i++) {
3230  struct task_struct *q = heap->ptrs[i];
3231  if (i == 0) {
3232  latest_time = q->start_time;
3233  latest_task = q;
3234  }
3235  /* Process the task per the caller's callback */
3236  scan->process_task(q, scan);
3237  put_task_struct(q);
3238  }
3239  /*
3240  * If we had to process any tasks at all, scan again
3241  * in case some of them were in the middle of forking
3242  * children that didn't get processed.
3243  * Not the most efficient way to do it, but it avoids
3244  * having to take callback_mutex in the fork path
3245  */
3246  goto again;
3247  }
3248  if (heap == &tmp_heap)
3249  heap_free(&tmp_heap);
3250  return 0;
3251 }
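/*
 * Illustrative sketch, not part of cgroup.c: a cgroup_scan_tasks() caller
 * (cpuset is the typical user) fills in a cgroup_scanner with its test and
 * process callbacks; the names and the PF_KTHREAD filter below are
 * hypothetical.
 */
static int example_test_task(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_KTHREAD);	/* e.g. skip kernel threads */
}

static void example_process_task(struct task_struct *p,
				 struct cgroup_scanner *scan)
{
	/* per-task update, done without css_set_lock held */
}

static int example_update_cgroup(struct cgroup *cgrp, struct ptr_heap *heap)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= example_test_task,
		.process_task	= example_process_task,
		.heap		= heap,	/* may be NULL: a temporary heap is used */
	};

	return cgroup_scan_tasks(&scan);
}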
3252 
3253 /*
3254  * Stuff for reading the 'tasks'/'procs' files.
3255  *
3256  * Reading this file can return large amounts of data if a cgroup has
3257  * *lots* of attached tasks. So it may need several calls to read(),
3258  * but we cannot guarantee that the information we produce is correct
3259  * unless we produce it entirely atomically.
3260  *
3261  */
3262 
3263 /* which pidlist file are we talking about? */
3264 enum cgroup_filetype {
3265  CGROUP_FILE_PROCS,
3266  CGROUP_FILE_TASKS,
3267 };
3268 
3269 /*
3270  * A pidlist is a list of pids that virtually represents the contents of one
3271  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3272  * a pair (one each for procs, tasks) for each pid namespace that's relevant
3273  * to the cgroup.
3274  */
3275 struct cgroup_pidlist {
3276  /*
3277  * used to find which pidlist is wanted. doesn't change as long as
3278  * this particular list stays in the list.
3279  */
3280  struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3281  /* array of xids */
3282  pid_t *list;
3283  /* how many elements the above list has */
3284  int length;
3285  /* how many files are using the current array */
3286  int use_count;
3287  /* each of these stored in a list by its cgroup */
3288  struct list_head links;
3289  /* pointer to the cgroup we belong to, for list removal purposes */
3290  struct cgroup *owner;
3291  /* protects the other fields */
3292  struct rw_semaphore mutex;
3293 };
3294 
3295 /*
3296  * The following two functions "fix" the issue where there are more pids
3297  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3298  * TODO: replace with a kernel-wide solution to this problem
3299  */
3300 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3301 static void *pidlist_allocate(int count)
3302 {
3303  if (PIDLIST_TOO_LARGE(count))
3304  return vmalloc(count * sizeof(pid_t));
3305  else
3306  return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3307 }
3308 static void pidlist_free(void *p)
3309 {
3310  if (is_vmalloc_addr(p))
3311  vfree(p);
3312  else
3313  kfree(p);
3314 }
3315 static void *pidlist_resize(void *p, int newcount)
3316 {
3317  void *newlist;
3318  /* note: if new alloc fails, old p will still be valid either way */
3319  if (is_vmalloc_addr(p)) {
3320  newlist = vmalloc(newcount * sizeof(pid_t));
3321  if (!newlist)
3322  return NULL;
3323  memcpy(newlist, p, newcount * sizeof(pid_t));
3324  vfree(p);
3325  } else {
3326  newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3327  }
3328  return newlist;
3329 }
3330 
3331 /*
3332  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3333  * If the new stripped list is sufficiently smaller and there's enough memory
3334  * to allocate a new buffer, will let go of the unneeded memory. Returns the
3335  * number of unique elements.
3336  */
3337 /* is the size difference enough that we should re-allocate the array? */
3338 #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3339 static int pidlist_uniq(pid_t **p, int length)
3340 {
3341  int src, dest = 1;
3342  pid_t *list = *p;
3343  pid_t *newlist;
3344 
3345  /*
3346  * we presume the 0th element is unique, so i starts at 1. trivial
3347  * edge cases first; no work needs to be done for either
3348  */
3349  if (length == 0 || length == 1)
3350  return length;
3351  /* src and dest walk down the list; dest counts unique elements */
3352  for (src = 1; src < length; src++) {
3353  /* find next unique element */
3354  while (list[src] == list[src-1]) {
3355  src++;
3356  if (src == length)
3357  goto after;
3358  }
3359  /* dest always points to where the next unique element goes */
3360  list[dest] = list[src];
3361  dest++;
3362  }
3363 after:
3364  /*
3365  * if the length difference is large enough, we want to allocate a
3366  * smaller buffer to save memory. if this fails due to out of memory,
3367  * we'll just stay with what we've got.
3368  */
3369  if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3370  newlist = pidlist_resize(list, dest);
3371  if (newlist)
3372  *p = newlist;
3373  }
3374  return dest;
3375 }
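/*
 * Illustrative worked example, not part of cgroup.c: on a sorted,
 * kmalloc()ed array the function compacts duplicates in place and only
 * shrinks the allocation when the saving exceeds
 * PIDLIST_REALLOC_DIFFERENCE(). "example_uniq" is hypothetical.
 */
static void example_uniq(void)
{
	pid_t init[] = { 3, 3, 5, 7, 7, 7 };
	pid_t *list = kmalloc(sizeof(init), GFP_KERNEL);
	int n;

	if (!list)
		return;
	memcpy(list, init, sizeof(init));
	n = pidlist_uniq(&list, 6);	/* n == 3; list begins 3, 5, 7 */
	kfree(list);
}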
3376 
3377 static int cmppid(const void *a, const void *b)
3378 {
3379  return *(pid_t *)a - *(pid_t *)b;
3380 }
3381 
3382 /*
3383  * find the appropriate pidlist for our purpose (given procs vs tasks)
3384  * returns with the lock on that pidlist already held, and takes care
3385  * of the use count, or returns NULL with no locks held if we're out of
3386  * memory.
3387  */
3388 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3389  enum cgroup_filetype type)
3390 {
3391  struct cgroup_pidlist *l;
3392  /* don't need task_nsproxy() if we're looking at ourself */
3393  struct pid_namespace *ns = current->nsproxy->pid_ns;
3394 
3395  /*
3396  * We can't drop the pidlist_mutex before taking the l->mutex in case
3397  * the last ref-holder is trying to remove l from the list at the same
3398  * time. Holding the pidlist_mutex precludes somebody taking whichever
3399  * list we find out from under us - compare release_pid_array().
3400  */
3401  mutex_lock(&cgrp->pidlist_mutex);
3402  list_for_each_entry(l, &cgrp->pidlists, links) {
3403  if (l->key.type == type && l->key.ns == ns) {
3404  /* make sure l doesn't vanish out from under us */
3405  down_write(&l->mutex);
3406  mutex_unlock(&cgrp->pidlist_mutex);
3407  return l;
3408  }
3409  }
3410  /* entry not found; create a new one */
3411  l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3412  if (!l) {
3413  mutex_unlock(&cgrp->pidlist_mutex);
3414  return l;
3415  }
3416  init_rwsem(&l->mutex);
3417  down_write(&l->mutex);
3418  l->key.type = type;
3419  l->key.ns = get_pid_ns(ns);
3420  l->use_count = 0; /* don't increment here */
3421  l->list = NULL;
3422  l->owner = cgrp;
3423  list_add(&l->links, &cgrp->pidlists);
3424  mutex_unlock(&cgrp->pidlist_mutex);
3425  return l;
3426 }
3427 
3428 /*
3429  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3430  */
3431 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3432  struct cgroup_pidlist **lp)
3433 {
3434  pid_t *array;
3435  int length;
3436  int pid, n = 0; /* used for populating the array */
3437  struct cgroup_iter it;
3438  struct task_struct *tsk;
3439  struct cgroup_pidlist *l;
3440 
3441  /*
3442  * If cgroup gets more users after we read count, we won't have
3443  * enough space - tough. This race is indistinguishable to the
3444  * caller from the case that the additional cgroup users didn't
3445  * show up until sometime later on.
3446  */
3447  length = cgroup_task_count(cgrp);
3448  array = pidlist_allocate(length);
3449  if (!array)
3450  return -ENOMEM;
3451  /* now, populate the array */
3452  cgroup_iter_start(cgrp, &it);
3453  while ((tsk = cgroup_iter_next(cgrp, &it))) {
3454  if (unlikely(n == length))
3455  break;
3456  /* get tgid or pid for procs or tasks file respectively */
3457  if (type == CGROUP_FILE_PROCS)
3458  pid = task_tgid_vnr(tsk);
3459  else
3460  pid = task_pid_vnr(tsk);
3461  if (pid > 0) /* make sure to only use valid results */
3462  array[n++] = pid;
3463  }
3464  cgroup_iter_end(cgrp, &it);
3465  length = n;
3466  /* now sort & (if procs) strip out duplicates */
3467  sort(array, length, sizeof(pid_t), cmppid, NULL);
3468  if (type == CGROUP_FILE_PROCS)
3469  length = pidlist_uniq(&array, length);
3470  l = cgroup_pidlist_find(cgrp, type);
3471  if (!l) {
3472  pidlist_free(array);
3473  return -ENOMEM;
3474  }
3475  /* store array, freeing old if necessary - lock already held */
3476  pidlist_free(l->list);
3477  l->list = array;
3478  l->length = length;
3479  l->use_count++;
3480  up_write(&l->mutex);
3481  *lp = l;
3482  return 0;
3483 }
3484 
3494 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3495 {
3496  int ret = -EINVAL;
3497  struct cgroup *cgrp;
3498  struct cgroup_iter it;
3499  struct task_struct *tsk;
3500 
3501  /*
3502  * Validate dentry by checking the superblock operations,
3503  * and make sure it's a directory.
3504  */
3505  if (dentry->d_sb->s_op != &cgroup_ops ||
3506  !S_ISDIR(dentry->d_inode->i_mode))
3507  goto err;
3508 
3509  ret = 0;
3510  cgrp = dentry->d_fsdata;
3511 
3512  cgroup_iter_start(cgrp, &it);
3513  while ((tsk = cgroup_iter_next(cgrp, &it))) {
3514  switch (tsk->state) {
3515  case TASK_RUNNING:
3516  stats->nr_running++;
3517  break;
3518  case TASK_INTERRUPTIBLE:
3519  stats->nr_sleeping++;
3520  break;
3521  case TASK_UNINTERRUPTIBLE:
3522  stats->nr_uninterruptible++;
3523  break;
3524  case TASK_STOPPED:
3525  stats->nr_stopped++;
3526  break;
3527  default:
3528  if (delayacct_is_task_waiting_on_io(tsk))
3529  stats->nr_io_wait++;
3530  break;
3531  }
3532  }
3533  cgroup_iter_end(cgrp, &it);
3534 
3535 err:
3536  return ret;
3537 }
3538 
3539 
3540 /*
3541  * seq_file methods for the tasks/procs files. The seq_file position is the
3542  * next pid to display; the seq_file iterator is a pointer to the pid
3543  * in the cgroup->l->list array.
3544  */
3545 
3546 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3547 {
3548  /*
3549  * Initially we receive a position value that corresponds to
3550  * one more than the last pid shown (or 0 on the first call or
3551  * after a seek to the start). Use a binary-search to find the
3552  * next pid to display, if any
3553  */
3554  struct cgroup_pidlist *l = s->private;
3555  int index = 0, pid = *pos;
3556  int *iter;
3557 
3558  down_read(&l->mutex);
3559  if (pid) {
3560  int end = l->length;
3561 
3562  while (index < end) {
3563  int mid = (index + end) / 2;
3564  if (l->list[mid] == pid) {
3565  index = mid;
3566  break;
3567  } else if (l->list[mid] <= pid)
3568  index = mid + 1;
3569  else
3570  end = mid;
3571  }
3572  }
3573  /* If we're off the end of the array, we're done */
3574  if (index >= l->length)
3575  return NULL;
3576  /* Update the abstract position to be the actual pid that we found */
3577  iter = l->list + index;
3578  *pos = *iter;
3579  return iter;
3580 }
3581 
3582 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3583 {
3584  struct cgroup_pidlist *l = s->private;
3585  up_read(&l->mutex);
3586 }
3587 
3588 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3589 {
3590  struct cgroup_pidlist *l = s->private;
3591  pid_t *p = v;
3592  pid_t *end = l->list + l->length;
3593  /*
3594  * Advance to the next pid in the array. If this goes off the
3595  * end, we're done
3596  */
3597  p++;
3598  if (p >= end) {
3599  return NULL;
3600  } else {
3601  *pos = *p;
3602  return p;
3603  }
3604 }
3605 
3606 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3607 {
3608  return seq_printf(s, "%d\n", *(int *)v);
3609 }
3610 
3611 /*
3612  * seq_operations functions for iterating on pidlists through seq_file -
3613  * independent of whether it's tasks or procs
3614  */
3615 static const struct seq_operations cgroup_pidlist_seq_operations = {
3616  .start = cgroup_pidlist_start,
3617  .stop = cgroup_pidlist_stop,
3618  .next = cgroup_pidlist_next,
3619  .show = cgroup_pidlist_show,
3620 };
3621 
3622 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3623 {
3624  /*
3625  * the case where we're the last user of this particular pidlist will
3626  * have us remove it from the cgroup's list, which entails taking the
3627  * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3628  * pidlist_mutex, we have to take pidlist_mutex first.
3629  */
3630  mutex_lock(&l->owner->pidlist_mutex);
3631  down_write(&l->mutex);
3632  BUG_ON(!l->use_count);
3633  if (!--l->use_count) {
3634  /* we're the last user if refcount is 0; remove and free */
3635  list_del(&l->links);
3636  mutex_unlock(&l->owner->pidlist_mutex);
3637  pidlist_free(l->list);
3638  put_pid_ns(l->key.ns);
3639  up_write(&l->mutex);
3640  kfree(l);
3641  return;
3642  }
3643  mutex_unlock(&l->owner->pidlist_mutex);
3644  up_write(&l->mutex);
3645 }
3646 
3647 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3648 {
3649  struct cgroup_pidlist *l;
3650  if (!(file->f_mode & FMODE_READ))
3651  return 0;
3652  /*
3653  * the seq_file will only be initialized if the file was opened for
3654  * reading; hence we check if it's not null only in that case.
3655  */
3656  l = ((struct seq_file *)file->private_data)->private;
3657  cgroup_release_pid_array(l);
3658  return seq_release(inode, file);
3659 }
3660 
3661 static const struct file_operations cgroup_pidlist_operations = {
3662  .read = seq_read,
3663  .llseek = seq_lseek,
3664  .write = cgroup_file_write,
3665  .release = cgroup_pidlist_release,
3666 };
3667 
3668 /*
3669  * The following functions handle opens on a file that displays a pidlist
3670  * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3671  * in the cgroup.
3672  */
3673 /* helper function for the two below it */
3674 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3675 {
3676  struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3677  struct cgroup_pidlist *l;
3678  int retval;
3679 
3680  /* Nothing to do for write-only files */
3681  if (!(file->f_mode & FMODE_READ))
3682  return 0;
3683 
3684  /* have the array populated */
3685  retval = pidlist_array_load(cgrp, type, &l);
3686  if (retval)
3687  return retval;
3688  /* configure file information */
3689  file->f_op = &cgroup_pidlist_operations;
3690 
3691  retval = seq_open(file, &cgroup_pidlist_seq_operations);
3692  if (retval) {
3693  cgroup_release_pid_array(l);
3694  return retval;
3695  }
3696  ((struct seq_file *)file->private_data)->private = l;
3697  return 0;
3698 }
3699 static int cgroup_tasks_open(struct inode *unused, struct file *file)
3700 {
3701  return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3702 }
3703 static int cgroup_procs_open(struct inode *unused, struct file *file)
3704 {
3705  return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3706 }
3707 
3708 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
3709  struct cftype *cft)
3710 {
3711  return notify_on_release(cgrp);
3712 }
3713 
3714 static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3715  struct cftype *cft,
3716  u64 val)
3717 {
3718  clear_bit(CGRP_RELEASABLE, &cgrp->flags);
3719  if (val)
3720  set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3721  else
3722  clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3723  return 0;
3724 }
3725 
3726 /*
3727  * Unregister event and free resources.
3728  *
3729  * Gets called from workqueue.
3730  */
3731 static void cgroup_event_remove(struct work_struct *work)
3732 {
3733  struct cgroup_event *event = container_of(work, struct cgroup_event,
3734  remove);
3735  struct cgroup *cgrp = event->cgrp;
3736 
3737  event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3738 
3739  eventfd_ctx_put(event->eventfd);
3740  kfree(event);
3741  dput(cgrp->dentry);
3742 }
3743 
3744 /*
3745  * Gets called on POLLHUP on eventfd when user closes it.
3746  *
3747  * Called with wqh->lock held and interrupts disabled.
3748  */
3749 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3750  int sync, void *key)
3751 {
3752  struct cgroup_event *event = container_of(wait,
3753  struct cgroup_event, wait);
3754  struct cgroup *cgrp = event->cgrp;
3755  unsigned long flags = (unsigned long)key;
3756 
3757  if (flags & POLLHUP) {
3758  __remove_wait_queue(event->wqh, &event->wait);
3759  spin_lock(&cgrp->event_list_lock);
3760  list_del(&event->list);
3761  spin_unlock(&cgrp->event_list_lock);
3762  /*
3763  * We are in atomic context, but cgroup_event_remove() may
3764  * sleep, so we have to call it in workqueue.
3765  */
3766  schedule_work(&event->remove);
3767  }
3768 
3769  return 0;
3770 }
3771 
3772 static void cgroup_event_ptable_queue_proc(struct file *file,
3773  wait_queue_head_t *wqh, poll_table *pt)
3774 {
3775  struct cgroup_event *event = container_of(pt,
3776  struct cgroup_event, pt);
3777 
3778  event->wqh = wqh;
3779  add_wait_queue(wqh, &event->wait);
3780 }
3781 
3782 /*
3783  * Parse input and register new cgroup event handler.
3784  *
3785  * Input must be in format '<event_fd> <control_fd> <args>'.
3786  * Interpretation of args is defined by control file implementation.
3787  */
3788 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3789  const char *buffer)
3790 {
3791  struct cgroup_event *event = NULL;
3792  unsigned int efd, cfd;
3793  struct file *efile = NULL;
3794  struct file *cfile = NULL;
3795  char *endp;
3796  int ret;
3797 
3798  efd = simple_strtoul(buffer, &endp, 10);
3799  if (*endp != ' ')
3800  return -EINVAL;
3801  buffer = endp + 1;
3802 
3803  cfd = simple_strtoul(buffer, &endp, 10);
3804  if ((*endp != ' ') && (*endp != '\0'))
3805  return -EINVAL;
3806  buffer = endp + 1;
3807 
3808  event = kzalloc(sizeof(*event), GFP_KERNEL);
3809  if (!event)
3810  return -ENOMEM;
3811  event->cgrp = cgrp;
3812  INIT_LIST_HEAD(&event->list);
3813  init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3814  init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3815  INIT_WORK(&event->remove, cgroup_event_remove);
3816 
3817  efile = eventfd_fget(efd);
3818  if (IS_ERR(efile)) {
3819  ret = PTR_ERR(efile);
3820  goto fail;
3821  }
3822 
3823  event->eventfd = eventfd_ctx_fileget(efile);
3824  if (IS_ERR(event->eventfd)) {
3825  ret = PTR_ERR(event->eventfd);
3826  goto fail;
3827  }
3828 
3829  cfile = fget(cfd);
3830  if (!cfile) {
3831  ret = -EBADF;
3832  goto fail;
3833  }
3834 
3835  /* the process need read permission on control file */
3836  /* AV: shouldn't we check that it's been opened for read instead? */
3837  ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3838  if (ret < 0)
3839  goto fail;
3840 
3841  event->cft = __file_cft(cfile);
3842  if (IS_ERR(event->cft)) {
3843  ret = PTR_ERR(event->cft);
3844  goto fail;
3845  }
3846 
3847  if (!event->cft->register_event || !event->cft->unregister_event) {
3848  ret = -EINVAL;
3849  goto fail;
3850  }
3851 
3852  ret = event->cft->register_event(cgrp, event->cft,
3853  event->eventfd, buffer);
3854  if (ret)
3855  goto fail;
3856 
3857  if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3858  event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3859  ret = 0;
3860  goto fail;
3861  }
3862 
3863  /*
3864  * Events should be removed after rmdir of cgroup directory, but before
3865  * destroying subsystem state objects. Let's take reference to cgroup
3866  * directory dentry to do that.
3867  */
3868  dget(cgrp->dentry);
3869 
3870  spin_lock(&cgrp->event_list_lock);
3871  list_add(&event->list, &cgrp->event_list);
3872  spin_unlock(&cgrp->event_list_lock);
3873 
3874  fput(cfile);
3875  fput(efile);
3876 
3877  return 0;
3878 
3879 fail:
3880  if (cfile)
3881  fput(cfile);
3882 
3883  if (event && event->eventfd && !IS_ERR(event->eventfd))
3884  eventfd_ctx_put(event->eventfd);
3885 
3886  if (!IS_ERR_OR_NULL(efile))
3887  fput(efile);
3888 
3889  kfree(event);
3890 
3891  return ret;
3892 }
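/*
 * Illustrative sketch of the userspace side (hypothetical standalone
 * program, not kernel code): register an eventfd against a control file
 * that implements register_event/unregister_event by writing the
 * "<event_fd> <control_fd> <args>" string parsed above into
 * "cgroup.event_control". Paths and the choice of control file are
 * assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int register_cgroup_event(const char *cgrp_dir,
				 const char *control_file, const char *args)
{
	char path[4096], cmd[256];
	int efd, cfd, ecfd, ret = -1;

	efd = eventfd(0, 0);
	if (efd < 0)
		return -1;

	snprintf(path, sizeof(path), "%s/%s", cgrp_dir, control_file);
	cfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cgrp_dir);
	ecfd = open(path, O_WRONLY);

	if (cfd >= 0 && ecfd >= 0) {
		snprintf(cmd, sizeof(cmd), "%d %d %s", efd, cfd, args);
		if (write(ecfd, cmd, strlen(cmd)) > 0)
			ret = efd;	/* read(efd, ...) blocks until it fires */
	}
	if (cfd >= 0)
		close(cfd);
	if (ecfd >= 0)
		close(ecfd);
	if (ret < 0)
		close(efd);
	return ret;
}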
3893 
3894 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895  struct cftype *cft)
3896 {
3897  return clone_children(cgrp);
3898 }
3899 
3900 static int cgroup_clone_children_write(struct cgroup *cgrp,
3901  struct cftype *cft,
3902  u64 val)
3903 {
3904  if (val)
3905  set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3906  else
3907  clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3908  return 0;
3909 }
3910 
3911 /*
3912  * for the common functions, 'private' gives the type of file
3913  */
3914 /* for hysterical raisins, we can't put this on the older files */
3915 #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3916 static struct cftype files[] = {
3917  {
3918  .name = "tasks",
3919  .open = cgroup_tasks_open,
3920  .write_u64 = cgroup_tasks_write,
3921  .release = cgroup_pidlist_release,
3922  .mode = S_IRUGO | S_IWUSR,
3923  },
3924  {
3925  .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3926  .open = cgroup_procs_open,
3927  .write_u64 = cgroup_procs_write,
3928  .release = cgroup_pidlist_release,
3929  .mode = S_IRUGO | S_IWUSR,
3930  },
3931  {
3932  .name = "notify_on_release",
3933  .read_u64 = cgroup_read_notify_on_release,
3934  .write_u64 = cgroup_write_notify_on_release,
3935  },
3936  {
3937  .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3938  .write_string = cgroup_write_event_control,
3939  .mode = S_IWUGO,
3940  },
3941  {
3942  .name = "cgroup.clone_children",
3943  .read_u64 = cgroup_clone_children_read,
3944  .write_u64 = cgroup_clone_children_write,
3945  },
3946  {
3947  .name = "release_agent",
3948  .flags = CFTYPE_ONLY_ON_ROOT,
3949  .read_seq_string = cgroup_release_agent_show,
3950  .write_string = cgroup_release_agent_write,
3951  .max_write_len = PATH_MAX,
3952  },
3953  { } /* terminate */
3954 };
3955 
3962 static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3963  unsigned long subsys_mask)
3964 {
3965  int err;
3966  struct cgroup_subsys *ss;
3967 
3968  if (base_files) {
3969  err = cgroup_addrm_files(cgrp, NULL, files, true);
3970  if (err < 0)
3971  return err;
3972  }
3973 
3974  /* process cftsets of each subsystem */
3975  for_each_subsys(cgrp->root, ss) {
3976  struct cftype_set *set;
3977  if (!test_bit(ss->subsys_id, &subsys_mask))
3978  continue;
3979 
3980  list_for_each_entry(set, &ss->cftsets, node)
3981  cgroup_addrm_files(cgrp, ss, set->cfts, true);
3982  }
3983 
3984  /* This cgroup is ready now */
3985  for_each_subsys(cgrp->root, ss) {
3986  struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3987  /*
3988  * Update the id->css pointer and make this css visible to the
3989  * CSS ID functions. This pointer will be dereferenced
3990  * from the RCU read side without locks.
3991  */
3992  if (css->id)
3993  rcu_assign_pointer(css->id->css, css);
3994  }
3995 
3996  return 0;
3997 }
3998 
3999 static void css_dput_fn(struct work_struct *work)
4000 {
4001  struct cgroup_subsys_state *css =
4002  container_of(work, struct cgroup_subsys_state, dput_work);
4003  struct dentry *dentry = css->cgroup->dentry;
4004  struct super_block *sb = dentry->d_sb;
4005 
4006  atomic_inc(&sb->s_active);
4007  dput(dentry);
4008  deactivate_super(sb);
4009 }
4010 
4011 static void init_cgroup_css(struct cgroup_subsys_state *css,
4012  struct cgroup_subsys *ss,
4013  struct cgroup *cgrp)
4014 {
4015  css->cgroup = cgrp;
4016  atomic_set(&css->refcnt, 1);
4017  css->flags = 0;
4018  css->id = NULL;
4019  if (cgrp == dummytop)
4020  set_bit(CSS_ROOT, &css->flags);
4021  BUG_ON(cgrp->subsys[ss->subsys_id]);
4022  cgrp->subsys[ss->subsys_id] = css;
4023 
4024  /*
4025  * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
4026  * which is put on the last css_put(). dput() requires process
4027  * context, which css_put() may be called without. @css->dput_work
4028  * will be used to invoke dput() asynchronously from css_put().
4029  */
4030  INIT_WORK(&css->dput_work, css_dput_fn);
4031  if (ss->__DEPRECATED_clear_css_refs)
4032  set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
4033 }
4034 
4035 /*
4036  * cgroup_create - create a cgroup
4037  * @parent: cgroup that will be parent of the new cgroup
4038  * @dentry: dentry of the new cgroup
4039  * @mode: mode to set on new inode
4040  *
4041  * Must be called with the mutex on the parent inode held
4042  */
4043 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4044  umode_t mode)
4045 {
4046  struct cgroup *cgrp;
4047  struct cgroupfs_root *root = parent->root;
4048  int err = 0;
4049  struct cgroup_subsys *ss;
4050  struct super_block *sb = root->sb;
4051 
4052  cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053  if (!cgrp)
4054  return -ENOMEM;
4055 
4056  /* Grab a reference on the superblock so the hierarchy doesn't
4057  * get deleted on unmount if there are child cgroups. This
4058  * can be done outside cgroup_mutex, since the sb can't
4059  * disappear while someone has an open control file on the
4060  * fs */
4061  atomic_inc(&sb->s_active);
4062 
4063  mutex_lock(&cgroup_mutex);
4064 
4065  init_cgroup_housekeeping(cgrp);
4066 
4067  cgrp->parent = parent;
4068  cgrp->root = parent->root;
4069  cgrp->top_cgroup = parent->top_cgroup;
4070 
4071  if (notify_on_release(parent))
4072  set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 
4074  if (clone_children(parent))
4075  set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
4076 
4077  for_each_subsys(root, ss) {
4078  struct cgroup_subsys_state *css;
4079 
4080  css = ss->create(cgrp);
4081  if (IS_ERR(css)) {
4082  err = PTR_ERR(css);
4083  goto err_destroy;
4084  }
4085  init_cgroup_css(css, ss, cgrp);
4086  if (ss->use_id) {
4087  err = alloc_css_id(ss, parent, cgrp);
4088  if (err)
4089  goto err_destroy;
4090  }
4091  /* On error, the ->destroy() callback has to free the assigned ID. */
4092  if (clone_children(parent) && ss->post_clone)
4093  ss->post_clone(cgrp);
4094 
4095  if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096  parent->parent) {
4097  pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4098  current->comm, current->pid, ss->name);
4099  if (!strcmp(ss->name, "memory"))
4100  pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4101  ss->warned_broken_hierarchy = true;
4102  }
4103  }
4104 
4105  list_add(&cgrp->sibling, &cgrp->parent->children);
4106  root->number_of_cgroups++;
4107 
4108  err = cgroup_create_dir(cgrp, dentry, mode);
4109  if (err < 0)
4110  goto err_remove;
4111 
4112  /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113  for_each_subsys(root, ss)
4114  if (!ss->__DEPRECATED_clear_css_refs)
4115  dget(dentry);
4116 
4117  /* The cgroup directory was pre-locked for us */
4118  BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119 
4120  list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121 
4122  err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123  /* If err < 0, we have a half-filled directory - oh well ;) */
4124 
4125  mutex_unlock(&cgroup_mutex);
4126  mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 
4128  return 0;
4129 
4130  err_remove:
4131 
4132  list_del(&cgrp->sibling);
4133  root->number_of_cgroups--;
4134 
4135  err_destroy:
4136 
4137  for_each_subsys(root, ss) {
4138  if (cgrp->subsys[ss->subsys_id])
4139  ss->destroy(cgrp);
4140  }
4141 
4142  mutex_unlock(&cgroup_mutex);
4143 
4144  /* Release the reference count that we took on the superblock */
4145  deactivate_super(sb);
4146 
4147  kfree(cgrp);
4148  return err;
4149 }
4150 
4151 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4152 {
4153  struct cgroup *c_parent = dentry->d_parent->d_fsdata;
4154 
4155  /* the vfs holds inode->i_mutex already */
4156  return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4157 }
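/*
 * For illustration (not from the original source): cgroup_mkdir() is the
 * inode operation that services a plain mkdir(2) on a mounted hierarchy,
 * so creating a child group from userspace is simply (the mount point is
 * an assumption):
 *
 *	mkdir("/sys/fs/cgroup/cpu/mygrp", 0755);
 *
 * The VFS takes the parent directory's i_mutex and then calls down into
 * cgroup_create() above with the new dentry.
 */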
4158 
4159 /*
4160  * Check the reference count on each subsystem. Since we already
4161  * established that there are no tasks in the cgroup, if the css refcount
4162  * is also 1, then there should be no outstanding references, so the
4163  * subsystem is safe to destroy. We scan across all subsystems rather than
4164  * using the per-hierarchy linked list of mounted subsystems since we can
4165  * be called via check_for_release() with no synchronization other than
4166  * RCU, and the subsystem linked list isn't RCU-safe.
4167  */
4168 static int cgroup_has_css_refs(struct cgroup *cgrp)
4169 {
4170  int i;
4171 
4172  /*
4173  * We won't need to lock the subsys array, because the subsystems
4174  * we're concerned about aren't going anywhere since our cgroup root
4175  * has a reference on them.
4176  */
4177  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4178  struct cgroup_subsys *ss = subsys[i];
4179  struct cgroup_subsys_state *css;
4180 
4181  /* Skip subsystems not present or not in this hierarchy */
4182  if (ss == NULL || ss->root != cgrp->root)
4183  continue;
4184 
4185  css = cgrp->subsys[ss->subsys_id];
4186  /*
4187  * When called from check_for_release() it's possible
4188  * that by this point the cgroup has been removed
4189  * and the css deleted. But a false-positive doesn't
4190  * matter, since it can only happen if the cgroup
4191  * has been deleted and hence no longer needs the
4192  * release agent to be called anyway.
4193  */
4194  if (css && css_refcnt(css) > 1)
4195  return 1;
4196  }
4197  return 0;
4198 }
4199 
4200 /*
4201  * Atomically mark all (or else none) of the cgroup's CSS objects as
4202  * CSS_REMOVED. Return true on success, or false if the cgroup has
4203  * busy subsystems. Call with cgroup_mutex held
4204  *
4205  * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206  * not, cgroup removal behaves differently.
4207  *
4208  * If clear is set, css refcnt for the subsystem should be zero before
4209  * cgroup removal can be committed. This is implemented by
4210  * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211  * called multiple times until all css refcnts reach zero and is allowed to
4212  * veto removal on any invocation. This behavior is deprecated and will be
4213  * removed as soon as the existing user (memcg) is updated.
4214  *
4215  * If clear is not set, each css holds an extra reference to the cgroup's
4216  * dentry and cgroup removal proceeds regardless of css refs.
4217  * ->pre_destroy() will be called at least once and is not allowed to fail.
4218  * On the last put of each css, whenever that may be, the extra dentry ref
4219  * is put so that dentry destruction happens only after all css's are
4220  * released.
4221  */
4222 static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223 {
4224  struct cgroup_subsys *ss;
4225  unsigned long flags;
4226  bool failed = false;
4227 
4228  local_irq_save(flags);
4229 
4230  /*
4231  * Block new css_tryget() by deactivating refcnt. If all refcnts
4232  * for subsystems w/ clear_css_refs set were 1 at the moment of
4233  * deactivation, we succeeded.
4234  */
4235  for_each_subsys(cgrp->root, ss) {
4236  struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 
4238  WARN_ON(atomic_read(&css->refcnt) < 0);
4239  atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240 
4241  if (ss->__DEPRECATED_clear_css_refs)
4242  failed |= css_refcnt(css) != 1;
4243  }
4244 
4245  /*
4246  * If we succeeded, set REMOVED and put all the base refs; otherwise,
4247  * restore refcnts to positive values. Either way, all in-progress
4248  * css_tryget() will be released.
4249  */
4250  for_each_subsys(cgrp->root, ss) {
4251  struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252 
4253  if (!failed) {
4254  set_bit(CSS_REMOVED, &css->flags);
4255  css_put(css);
4256  } else {
4257  atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258  }
4259  }
4260 
4261  local_irq_restore(flags);
4262  return !failed;
4263 }
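/*
 * Worked example of the deactivation above (illustrative; the unbiasing
 * helper lives elsewhere in this file): a css whose refcnt is 1 becomes
 * INT_MIN + 1 after CSS_DEACT_BIAS is added.  A racing __css_tryget()
 * keeps failing its cmpxchg because the raw counter no longer matches the
 * unbiased value it expects, and it gives up once CSS_REMOVED is set,
 * while css_refcnt() can still recover the logical count as
 * (INT_MIN + 1) - INT_MIN = 1, which is exactly what the "failed" test in
 * the first loop relies on.
 */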
4264 
4265 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266 {
4267  struct cgroup *cgrp = dentry->d_fsdata;
4268  struct dentry *d;
4269  struct cgroup *parent;
4270  DEFINE_WAIT(wait);
4271  struct cgroup_event *event, *tmp;
4272  int ret;
4273 
4274  /* the vfs already holds both i_mutexes (parent and victim) for us */
4275 again:
4276  mutex_lock(&cgroup_mutex);
4277  if (atomic_read(&cgrp->count) != 0) {
4278  mutex_unlock(&cgroup_mutex);
4279  return -EBUSY;
4280  }
4281  if (!list_empty(&cgrp->children)) {
4282  mutex_unlock(&cgroup_mutex);
4283  return -EBUSY;
4284  }
4285  mutex_unlock(&cgroup_mutex);
4286 
4287  /*
4288  * In general, a subsystem has no css->refcnt left after pre_destroy().
4289  * But in racy cases a subsystem may still take a css->refcnt after
4290  * pre_destroy(), which makes rmdir return -EBUSY more often than
4291  * necessary. To avoid that, we use a waitqueue for the cgroup's rmdir.
4292  * CGRP_WAIT_ON_RMDIR synchronizes rmdir with the subsystems' reference
4293  * count handling. See the css_get/put and css_tryget() and
4294  * cgroup_wakeup_rmdir_waiter() implementations.
4295  */
4296  set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 
4298  /*
4299  * Call the subsystems' pre_destroy handlers to notify them
4300  * that an rmdir() request has arrived.
4301  */
4302  ret = cgroup_call_pre_destroy(cgrp);
4303  if (ret) {
4304  clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305  return ret;
4306  }
4307 
4308  mutex_lock(&cgroup_mutex);
4309  parent = cgrp->parent;
4310  if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311  clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312  mutex_unlock(&cgroup_mutex);
4313  return -EBUSY;
4314  }
4315  prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316  if (!cgroup_clear_css_refs(cgrp)) {
4317  mutex_unlock(&cgroup_mutex);
4318  /*
4319  * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320  * prepare_to_wait(), we need to check this flag.
4321  */
4322  if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323  schedule();
4324  finish_wait(&cgroup_rmdir_waitq, &wait);
4325  clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326  if (signal_pending(current))
4327  return -EINTR;
4328  goto again;
4329  }
4330  /* No css_tryget() can succeed after this point. */
4331  finish_wait(&cgroup_rmdir_waitq, &wait);
4332  clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 
4334  raw_spin_lock(&release_list_lock);
4335  set_bit(CGRP_REMOVED, &cgrp->flags);
4336  if (!list_empty(&cgrp->release_list))
4337  list_del_init(&cgrp->release_list);
4338  raw_spin_unlock(&release_list_lock);
4339 
4340  /* delete this cgroup from parent->children */
4341  list_del_init(&cgrp->sibling);
4342 
4343  list_del_init(&cgrp->allcg_node);
4344 
4345  d = dget(cgrp->dentry);
4346 
4347  cgroup_d_remove_dir(d);
4348  dput(d);
4349 
4350  set_bit(CGRP_RELEASABLE, &parent->flags);
4351  check_for_release(parent);
4352 
4353  /*
4354  * Unregister events and notify userspace.
4355  * Notify userspace about cgroup removal only after the rmdir of the
4356  * cgroup directory, to avoid a race between userspace and kernelspace.
4357  */
4358  spin_lock(&cgrp->event_list_lock);
4359  list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4360  list_del(&event->list);
4361  remove_wait_queue(event->wqh, &event->wait);
4362  eventfd_signal(event->eventfd, 1);
4363  schedule_work(&event->remove);
4364  }
4365  spin_unlock(&cgrp->event_list_lock);
4366 
4367  mutex_unlock(&cgroup_mutex);
4368  return 0;
4369 }
4370 
4371 static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372 {
4373  INIT_LIST_HEAD(&ss->cftsets);
4374 
4375  /*
4376  * base_cftset is embedded in the subsys itself, so there is no need
4377  * to worry about deregistration.
4378  */
4379  if (ss->base_cftypes) {
4380  ss->base_cftset.cfts = ss->base_cftypes;
4381  list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4382  }
4383 }
4384 
4385 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4386 {
4387  struct cgroup_subsys_state *css;
4388 
4389  printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 
4391  /* init base cftset */
4392  cgroup_init_cftsets(ss);
4393 
4394  /* Create the top cgroup state for this subsystem */
4395  list_add(&ss->sibling, &rootnode.subsys_list);
4396  ss->root = &rootnode;
4397  css = ss->create(dummytop);
4398  /* We don't handle early failures gracefully */
4399  BUG_ON(IS_ERR(css));
4400  init_cgroup_css(css, ss, dummytop);
4401 
4402  /* Update the init_css_set to contain a subsys
4403  * pointer to this state - since the subsystem is
4404  * newly registered, all tasks, and hence the
4405  * init_css_set, are in the subsystem's top cgroup. */
4406  init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
4407 
4408  need_forkexit_callback |= ss->fork || ss->exit;
4409 
4410  /* At system boot, before all subsystems have been
4411  * registered, no tasks have been forked, so we don't
4412  * need to invoke fork callbacks here. */
4413  BUG_ON(!list_empty(&init_task.tasks));
4414 
4415  ss->active = 1;
4416 
4417  /* this function shouldn't be used with modular subsystems, since they
4418  * need to register a subsys_id, among other things */
4419  BUG_ON(ss->module);
4420 }
4421 
4431 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432 {
4433  int i;
4434  struct cgroup_subsys_state *css;
4435 
4436  /* check name and function validity */
4437  if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438  ss->create == NULL || ss->destroy == NULL)
4439  return -EINVAL;
4440 
4441  /*
4442  * we don't support callbacks in modular subsystems. this check is
4443  * before the ss->module check for consistency; a subsystem that could
4444  * be a module should still have no callbacks even if the user isn't
4445  * compiling it as one.
4446  */
4447  if (ss->fork || ss->exit)
4448  return -EINVAL;
4449 
4450  /*
4451  * an optionally modular subsystem is built-in: we want to do nothing,
4452  * since cgroup_init_subsys will have already taken care of it.
4453  */
4454  if (ss->module == NULL) {
4455  /* a sanity check */
4456  BUG_ON(subsys[ss->subsys_id] != ss);
4457  return 0;
4458  }
4459 
4460  /* init base cftset */
4461  cgroup_init_cftsets(ss);
4462 
4463  mutex_lock(&cgroup_mutex);
4464  subsys[ss->subsys_id] = ss;
4465 
4466  /*
4467  * no subsystem's ->create() seems to need anything important from the
4468  * ss struct, so this can happen first (i.e. before the rootnode attachment).
4469  */
4470  css = ss->create(dummytop);
4471  if (IS_ERR(css)) {
4472  /* failure case - need to deassign the subsys[] slot. */
4473  subsys[ss->subsys_id] = NULL;
4474  mutex_unlock(&cgroup_mutex);
4475  return PTR_ERR(css);
4476  }
4477 
4478  list_add(&ss->sibling, &rootnode.subsys_list);
4479  ss->root = &rootnode;
4480 
4481  /* our new subsystem will be attached to the dummy hierarchy. */
4482  init_cgroup_css(css, ss, dummytop);
4483  /* init_idr must be after init_cgroup_css because it sets css->id. */
4484  if (ss->use_id) {
4485  int ret = cgroup_init_idr(ss, css);
4486  if (ret) {
4487  dummytop->subsys[ss->subsys_id] = NULL;
4488  ss->destroy(dummytop);
4489  subsys[ss->subsys_id] = NULL;
4490  mutex_unlock(&cgroup_mutex);
4491  return ret;
4492  }
4493  }
4494 
4495  /*
4496  * Now we need to entangle the css into the existing css_sets. Unlike
4497  * in cgroup_init_subsys, there are now multiple css_sets, so each one
4498  * will need a new pointer to it; this is done by iterating the
4499  * css_set_table. Furthermore, modifying the existing css_sets will
4500  * corrupt the hash table state, so each changed css_set will need its
4501  * hash recomputed. This is all done under the css_set_lock.
4502  */
4503  write_lock(&css_set_lock);
4504  for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
4505  struct css_set *cg;
4506  struct hlist_node *node, *tmp;
4507  struct hlist_head *bucket = &css_set_table[i], *new_bucket;
4508 
4509  hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
4510  /* skip entries that we already rehashed */
4511  if (cg->subsys[ss->subsys_id])
4512  continue;
4513  /* remove existing entry */
4514  hlist_del(&cg->hlist);
4515  /* set new value */
4516  cg->subsys[ss->subsys_id] = css;
4517  /* recompute hash and restore entry */
4518  new_bucket = css_set_hash(cg->subsys);
4519  hlist_add_head(&cg->hlist, new_bucket);
4520  }
4521  }
4522  write_unlock(&css_set_lock);
4523 
4524  ss->active = 1;
4525 
4526  /* success! */
4527  mutex_unlock(&cgroup_mutex);
4528  return 0;
4529 }
4531 
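/*
 * Minimal sketch of the module-side pairing for the two functions here
 * (not from this file; the "foo" subsystem, its callbacks and its
 * subsys_id are hypothetical):
 *
 *	static struct cgroup_subsys foo_subsys = {
 *		.name		= "foo",
 *		.create		= foo_create,
 *		.destroy	= foo_destroy,
 *		.subsys_id	= foo_subsys_id,
 *		.module		= THIS_MODULE,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return cgroup_load_subsys(&foo_subsys);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		cgroup_unload_subsys(&foo_subsys);
 *	}
 *
 * cgroup_load_subsys() above attaches the new css to the dummy hierarchy
 * and rehashes every existing css_set; cgroup_unload_subsys() below
 * reverses both steps.
 */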
4540 void cgroup_unload_subsys(struct cgroup_subsys *ss)
4541 {
4542  struct cg_cgroup_link *link;
4543  struct hlist_head *hhead;
4544 
4545  BUG_ON(ss->module == NULL);
4546 
4547  /*
4548  * we shouldn't be called if the subsystem is in use, and the use of
4549  * try_module_get in parse_cgroupfs_options should ensure that it
4550  * doesn't start being used while we're killing it off.
4551  */
4552  BUG_ON(ss->root != &rootnode);
4553 
4554  mutex_lock(&cgroup_mutex);
4555  /* deassign the subsys_id */
4556  subsys[ss->subsys_id] = NULL;
4557 
4558  /* remove subsystem from rootnode's list of subsystems */
4559  list_del_init(&ss->sibling);
4560 
4561  /*
4562  * disentangle the css from all css_sets attached to the dummytop. as
4563  * in loading, we need to pay our respects to the hashtable gods.
4564  */
4565  write_lock(&css_set_lock);
4566  list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4567  struct css_set *cg = link->cg;
4568 
4569  hlist_del(&cg->hlist);
4570  BUG_ON(!cg->subsys[ss->subsys_id]);
4571  cg->subsys[ss->subsys_id] = NULL;
4572  hhead = css_set_hash(cg->subsys);
4573  hlist_add_head(&cg->hlist, hhead);
4574  }
4575  write_unlock(&css_set_lock);
4576 
4577  /*
4578  * remove the subsystem's css from the dummytop and free it - it must be
4579  * freed before being marked NULL because ss->destroy needs the
4580  * cgrp->subsys pointer to find its state. Note that this also takes
4581  * care of freeing the css_id.
4582  */
4583  ss->destroy(dummytop);
4584  dummytop->subsys[ss->subsys_id] = NULL;
4585 
4586  mutex_unlock(&cgroup_mutex);
4587 }
4589 
4597 {
4598  int i;
4599  atomic_set(&init_css_set.refcount, 1);
4600  INIT_LIST_HEAD(&init_css_set.cg_links);
4601  INIT_LIST_HEAD(&init_css_set.tasks);
4602  INIT_HLIST_NODE(&init_css_set.hlist);
4603  css_set_count = 1;
4604  init_cgroup_root(&rootnode);
4605  root_count = 1;
4606  init_task.cgroups = &init_css_set;
4607 
4608  init_css_set_link.cg = &init_css_set;
4609  init_css_set_link.cgrp = dummytop;
4610  list_add(&init_css_set_link.cgrp_link_list,
4611  &rootnode.top_cgroup.css_sets);
4612  list_add(&init_css_set_link.cg_link_list,
4613  &init_css_set.cg_links);
4614 
4615  for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4616  INIT_HLIST_HEAD(&css_set_table[i]);
4617 
4618  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4619  struct cgroup_subsys *ss = subsys[i];
4620 
4621  /* at bootup time, we don't worry about modular subsystems */
4622  if (!ss || ss->module)
4623  continue;
4624 
4625  BUG_ON(!ss->name);
4626  BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627  BUG_ON(!ss->create);
4628  BUG_ON(!ss->destroy);
4629  if (ss->subsys_id != i) {
4630  printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631  ss->name, ss->subsys_id);
4632  BUG();
4633  }
4634 
4635  if (ss->early_init)
4636  cgroup_init_subsys(ss);
4637  }
4638  return 0;
4639 }
4640 
4648 {
4649  int err;
4650  int i;
4651  struct hlist_head *hhead;
4652 
4653  err = bdi_init(&cgroup_backing_dev_info);
4654  if (err)
4655  return err;
4656 
4657  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4658  struct cgroup_subsys *ss = subsys[i];
4659 
4660  /* at bootup time, we don't worry about modular subsystems */
4661  if (!ss || ss->module)
4662  continue;
4663  if (!ss->early_init)
4664  cgroup_init_subsys(ss);
4665  if (ss->use_id)
4666  cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4667  }
4668 
4669  /* Add init_css_set to the hash table */
4670  hhead = css_set_hash(init_css_set.subsys);
4671  hlist_add_head(&init_css_set.hlist, hhead);
4672  BUG_ON(!init_root_id(&rootnode));
4673 
4674  cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4675  if (!cgroup_kobj) {
4676  err = -ENOMEM;
4677  goto out;
4678  }
4679 
4680  err = register_filesystem(&cgroup_fs_type);
4681  if (err < 0) {
4682  kobject_put(cgroup_kobj);
4683  goto out;
4684  }
4685 
4686  proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4687 
4688 out:
4689  if (err)
4690  bdi_destroy(&cgroup_backing_dev_info);
4691 
4692  return err;
4693 }
4694 
4695 /*
4696  * proc_cgroup_show()
4697  * - Print task's cgroup paths into seq_file, one line for each hierarchy
4698  * - Used for /proc/<pid>/cgroup.
4699  * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4700  * doesn't really matter if tsk->cgroup changes after we read it,
4701  * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4702  * anyway. No need to check that tsk->cgroup != NULL, thanks to
4703  * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
4704  * cgroup to top_cgroup.
4705  */
4706 
4707 /* TODO: Use a proper seq_file iterator */
4708 static int proc_cgroup_show(struct seq_file *m, void *v)
4709 {
4710  struct pid *pid;
4711  struct task_struct *tsk;
4712  char *buf;
4713  int retval;
4714  struct cgroupfs_root *root;
4715 
4716  retval = -ENOMEM;
4717  buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4718  if (!buf)
4719  goto out;
4720 
4721  retval = -ESRCH;
4722  pid = m->private;
4723  tsk = get_pid_task(pid, PIDTYPE_PID);
4724  if (!tsk)
4725  goto out_free;
4726 
4727  retval = 0;
4728 
4729  mutex_lock(&cgroup_mutex);
4730 
4731  for_each_active_root(root) {
4732  struct cgroup_subsys *ss;
4733  struct cgroup *cgrp;
4734  int count = 0;
4735 
4736  seq_printf(m, "%d:", root->hierarchy_id);
4737  for_each_subsys(root, ss)
4738  seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4739  if (strlen(root->name))
4740  seq_printf(m, "%sname=%s", count ? "," : "",
4741  root->name);
4742  seq_putc(m, ':');
4743  cgrp = task_cgroup_from_root(tsk, root);
4744  retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4745  if (retval < 0)
4746  goto out_unlock;
4747  seq_puts(m, buf);
4748  seq_putc(m, '\n');
4749  }
4750 
4751 out_unlock:
4752  mutex_unlock(&cgroup_mutex);
4753  put_task_struct(tsk);
4754 out_free:
4755  kfree(buf);
4756 out:
4757  return retval;
4758 }
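/*
 * Illustrative output of the loop above, as seen in /proc/<pid>/cgroup
 * (the hierarchy ids, controllers and paths are examples, not fixed
 * values):
 *
 *	3:cpu,cpuacct:/user/1000
 *	2:memory:/
 *	1:name=systemd:/user/1000/session-1
 *
 * Each line is "<hierarchy-id>:<comma-separated subsystems and/or name=>:
 * <path of the task's cgroup in that hierarchy>", matching the
 * seq_printf() calls in proc_cgroup_show().
 */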
4759 
4760 static int cgroup_open(struct inode *inode, struct file *file)
4761 {
4762  struct pid *pid = PROC_I(inode)->pid;
4763  return single_open(file, proc_cgroup_show, pid);
4764 }
4765 
4767  .open = cgroup_open,
4768  .read = seq_read,
4769  .llseek = seq_lseek,
4770  .release = single_release,
4771 };
4772 
4773 /* Display information about each subsystem and each hierarchy */
4774 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4775 {
4776  int i;
4777 
4778  seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4779  /*
4780  * ideally we don't want subsystems moving around while we do this.
4781  * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4782  * subsys/hierarchy state.
4783  */
4784  mutex_lock(&cgroup_mutex);
4785  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4786  struct cgroup_subsys *ss = subsys[i];
4787  if (ss == NULL)
4788  continue;
4789  seq_printf(m, "%s\t%d\t%d\t%d\n",
4790  ss->name, ss->root->hierarchy_id,
4791  ss->root->number_of_cgroups, !ss->disabled);
4792  }
4793  mutex_unlock(&cgroup_mutex);
4794  return 0;
4795 }
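/*
 * Example of what the loop above produces in /proc/cgroups (the numbers
 * are illustrative):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset		1		4		1
 *	cpu		2		12		1
 *	memory		0		1		0
 *
 * A hierarchy id of 0 indicates a subsystem still attached to the unbound
 * dummy hierarchy, and enabled=0 reflects ss->disabled, e.g. after booting
 * with cgroup_disable=.
 */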
4796 
4797 static int cgroupstats_open(struct inode *inode, struct file *file)
4798 {
4799  return single_open(file, proc_cgroupstats_show, NULL);
4800 }
4801 
4802 static const struct file_operations proc_cgroupstats_operations = {
4803  .open = cgroupstats_open,
4804  .read = seq_read,
4805  .llseek = seq_lseek,
4806  .release = single_release,
4807 };
4808 
4826 {
4827  task_lock(current);
4828  child->cgroups = current->cgroups;
4829  get_css_set(child->cgroups);
4830  task_unlock(current);
4831  INIT_LIST_HEAD(&child->cg_list);
4832 }
4833 
4843 {
4844  if (need_forkexit_callback) {
4845  int i;
4846  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847  struct cgroup_subsys *ss = subsys[i];
4848 
4849  /*
4850  * forkexit callbacks are only supported for
4851  * builtin subsystems.
4852  */
4853  if (!ss || ss->module)
4854  continue;
4855 
4856  if (ss->fork)
4857  ss->fork(child);
4858  }
4859  }
4860 }
4861 
4872 {
4873  /*
4874  * use_task_css_set_links is set to 1 before we walk the tasklist
4875  * under the tasklist_lock and we read it here after we added the child
4876  * to the tasklist under the tasklist_lock as well. If the child wasn't
4877  * yet in the tasklist when we walked through it from
4878  * cgroup_enable_task_cg_lists(), then the use_task_css_set_links value
4879  * should be visible now due to the paired locking and barriers implied
4880  * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4881  * in cgroup_enable_task_cg_lists() and read here after tasklist_lock is
4882  * taken on fork.
4883  */
4884  if (use_task_css_set_links) {
4885  write_lock(&css_set_lock);
4886  task_lock(child);
4887  if (list_empty(&child->cg_list))
4888  list_add(&child->cg_list, &child->cgroups->tasks);
4889  task_unlock(child);
4890  write_unlock(&css_set_lock);
4891  }
4892 }
4928 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4929 {
4930  struct css_set *cg;
4931  int i;
4932 
4933  /*
4934  * Unlink from the css_set task list if necessary.
4935  * Optimistically check cg_list before taking
4936  * css_set_lock
4937  */
4938  if (!list_empty(&tsk->cg_list)) {
4939  write_lock(&css_set_lock);
4940  if (!list_empty(&tsk->cg_list))
4941  list_del_init(&tsk->cg_list);
4942  write_unlock(&css_set_lock);
4943  }
4944 
4945  /* Reassign the task to the init_css_set. */
4946  task_lock(tsk);
4947  cg = tsk->cgroups;
4948  tsk->cgroups = &init_css_set;
4949 
4950  if (run_callbacks && need_forkexit_callback) {
4951  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4952  struct cgroup_subsys *ss = subsys[i];
4953 
4954  /* modular subsystems can't use callbacks */
4955  if (!ss || ss->module)
4956  continue;
4957 
4958  if (ss->exit) {
4959  struct cgroup *old_cgrp =
4960  rcu_dereference_raw(cg->subsys[i])->cgroup;
4961  struct cgroup *cgrp = task_cgroup(tsk, i);
4962  ss->exit(cgrp, old_cgrp, tsk);
4963  }
4964  }
4965  }
4966  task_unlock(tsk);
4967 
4968  if (cg)
4969  put_css_set_taskexit(cg);
4970 }
4971 
4985 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
4986 {
4987  int ret;
4988  struct cgroup *target;
4989 
4990  if (cgrp == dummytop)
4991  return 1;
4992 
4993  target = task_cgroup_from_root(task, cgrp->root);
4994  while (cgrp != target && cgrp != cgrp->top_cgroup)
4995  cgrp = cgrp->parent;
4996  ret = (cgrp == target);
4997  return ret;
4998 }
4999 
5000 static void check_for_release(struct cgroup *cgrp)
5001 {
5002  /* All of these checks rely on RCU to keep the cgroup
5003  * structure alive */
5004  if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
5005  && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
5006  /* Control Group is currently removable. If it's not
5007  * already queued for a userspace notification, queue
5008  * it now */
5009  int need_schedule_work = 0;
5010  raw_spin_lock(&release_list_lock);
5011  if (!cgroup_is_removed(cgrp) &&
5012  list_empty(&cgrp->release_list)) {
5013  list_add(&cgrp->release_list, &release_list);
5014  need_schedule_work = 1;
5015  }
5016  raw_spin_unlock(&release_list_lock);
5017  if (need_schedule_work)
5018  schedule_work(&release_agent_work);
5019  }
5020 }
5021 
5022 /* Caller must verify that the css is not for root cgroup */
5023 bool __css_tryget(struct cgroup_subsys_state *css)
5024 {
5025  do {
5026  int v = css_refcnt(css);
5027 
5028  if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
5029  return true;
5030  cpu_relax();
5031  } while (!test_bit(CSS_REMOVED, &css->flags));
5032 
5033  return false;
5034 }
5036 
5037 /* Caller must verify that the css is not for root cgroup */
5038 void __css_put(struct cgroup_subsys_state *css)
5039 {
5040  struct cgroup *cgrp = css->cgroup;
5041  int v;
5042 
5043  rcu_read_lock();
5044  v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5045 
5046  switch (v) {
5047  case 1:
5048  if (notify_on_release(cgrp)) {
5049  set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050  check_for_release(cgrp);
5051  }
5052  cgroup_wakeup_rmdir_waiter(cgrp);
5053  break;
5054  case 0:
5055  if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
5056  schedule_work(&css->dput_work);
5057  break;
5058  }
5059  rcu_read_unlock();
5060 }
5062 
5063 /*
5064  * Notify userspace when a cgroup is released, by running the
5065  * configured release agent with the name of the cgroup (path
5066  * relative to the root of cgroup file system) as the argument.
5067  *
5068  * Most likely, this user command will try to rmdir this cgroup.
5069  *
5070  * This races with the possibility that some other task will be
5071  * attached to this cgroup before it is removed, or that some other
5072  * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
5073  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5074  * unused, and this cgroup will be reprieved from its death sentence,
5075  * to continue to serve a useful existence. Next time it's released,
5076  * we will get notified again, if it still has 'notify_on_release' set.
5077  *
5078  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5079  * means only wait until the task is successfully execve()'d. The
5080  * separate release agent task is forked by call_usermodehelper(),
5081  * then control in this thread returns here, without waiting for the
5082  * release agent task. We don't bother to wait because the caller of
5083  * this routine has no use for the exit status of the release agent
5084  * task, so no sense holding our caller up for that.
5085  */
5086 static void cgroup_release_agent(struct work_struct *work)
5087 {
5088  BUG_ON(work != &release_agent_work);
5089  mutex_lock(&cgroup_mutex);
5090  raw_spin_lock(&release_list_lock);
5091  while (!list_empty(&release_list)) {
5092  char *argv[3], *envp[3];
5093  int i;
5094  char *pathbuf = NULL, *agentbuf = NULL;
5095  struct cgroup *cgrp = list_entry(release_list.next,
5096  struct cgroup,
5097  release_list);
5098  list_del_init(&cgrp->release_list);
5099  raw_spin_unlock(&release_list_lock);
5100  pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5101  if (!pathbuf)
5102  goto continue_free;
5103  if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
5104  goto continue_free;
5105  agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5106  if (!agentbuf)
5107  goto continue_free;
5108 
5109  i = 0;
5110  argv[i++] = agentbuf;
5111  argv[i++] = pathbuf;
5112  argv[i] = NULL;
5113 
5114  i = 0;
5115  /* minimal command environment */
5116  envp[i++] = "HOME=/";
5117  envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5118  envp[i] = NULL;
5119 
5120  /* Drop the lock while we invoke the usermode helper,
5121  * since the exec could involve hitting disk and hence
5122  * be a slow process */
5123  mutex_unlock(&cgroup_mutex);
5124  call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5125  mutex_lock(&cgroup_mutex);
5126  continue_free:
5127  kfree(pathbuf);
5128  kfree(agentbuf);
5129  raw_spin_lock(&release_list_lock);
5130  }
5131  raw_spin_unlock(&release_list_lock);
5132  mutex_unlock(&cgroup_mutex);
5133 }
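/*
 * A minimal release agent, for illustration only (this program is not
 * part of the kernel and the mount point is an assumption).  It receives
 * the released cgroup's path, relative to the hierarchy root, as argv[1]
 * (exactly the argv[] assembled above) and removes the now-empty group:
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		char path[4096];
 *
 *		if (argc < 2)
 *			return 1;
 *		snprintf(path, sizeof(path), "/sys/fs/cgroup/memory%s",
 *			 argv[1]);
 *		return rmdir(path) ? 1 : 0;
 *	}
 *
 * The agent binary is whatever path was written to the hierarchy's
 * release_agent file, and it is only invoked for cgroups that have
 * notify_on_release set.
 */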
5134 
5135 static int __init cgroup_disable(char *str)
5136 {
5137  int i;
5138  char *token;
5139 
5140  while ((token = strsep(&str, ",")) != NULL) {
5141  if (!*token)
5142  continue;
5143  for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5144  struct cgroup_subsys *ss = subsys[i];
5145 
5146  /*
5147  * cgroup_disable, being at boot time, can't
5148  * know about module subsystems, so we don't
5149  * worry about them.
5150  */
5151  if (!ss || ss->module)
5152  continue;
5153 
5154  if (!strcmp(token, ss->name)) {
5155  ss->disabled = 1;
5156  printk(KERN_INFO "Disabling %s control group"
5157  " subsystem\n", ss->name);
5158  break;
5159  }
5160  }
5161  }
5162  return 1;
5163 }
5164 __setup("cgroup_disable=", cgroup_disable);
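/*
 * Usage note (illustrative): booting with, e.g., "cgroup_disable=memory"
 * on the kernel command line runs the parser above and sets ss->disabled
 * for the memory subsystem; the controller then cannot be bound to a
 * hierarchy and is reported with enabled=0 in /proc/cgroups.
 */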
5165 
5166 /*
5167  * Functions for CSS ID.
5168  */
5169 
5170 /*
5171  * To get an ID other than 0, this should be called when !cgroup_is_removed().
5172  */
5173 unsigned short css_id(struct cgroup_subsys_state *css)
5174 {
5175  struct css_id *cssid;
5176 
5177  /*
5178  * css_id() returns the correct value when someone holds a refcnt on the
5179  * css or when it is called under rcu_read_lock(). Once css->id is allocated,
5180  * it's unchanged until freed.
5181  */
5182  cssid = rcu_dereference_check(css->id, css_refcnt(css));
5183 
5184  if (cssid)
5185  return cssid->id;
5186  return 0;
5187 }
5189 
5190 unsigned short css_depth(struct cgroup_subsys_state *css)
5191 {
5192  struct css_id *cssid;
5193 
5194  cssid = rcu_dereference_check(css->id, css_refcnt(css));
5195 
5196  if (cssid)
5197  return cssid->depth;
5198  return 0;
5199 }
5201 
5215 bool css_is_ancestor(struct cgroup_subsys_state *child,
5216  const struct cgroup_subsys_state *root)
5217 {
5218  struct css_id *child_id;
5219  struct css_id *root_id;
5220 
5221  child_id = rcu_dereference(child->id);
5222  if (!child_id)
5223  return false;
5224  root_id = rcu_dereference(root->id);
5225  if (!root_id)
5226  return false;
5227  if (child_id->depth < root_id->depth)
5228  return false;
5229  if (child_id->stack[root_id->depth] != root_id->id)
5230  return false;
5231  return true;
5232 }
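/*
 * Worked example of the id/stack check above (the numbers are made up):
 * suppose root's css_id has depth 1 and id 4, and child's has depth 3 with
 * stack[] = { 1, 4, 7, 9 }, i.e. the ids of its ancestors from the top of
 * the hierarchy down, ending with its own id.  Then child_id->depth (3) is
 * >= root_id->depth (1) and child_id->stack[1] == 4 == root_id->id, so the
 * function returns true.  A css from an unrelated subtree would record a
 * different id at depth 1 and fail the second test.
 */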
5233 
5234 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5235 {
5236  struct css_id *id = css->id;
5237  /* When this is called before css_id initialization, id can be NULL */
5238  if (!id)
5239  return;
5240 
5241  BUG_ON(!ss->use_id);
5242 
5243  rcu_assign_pointer(id->css, NULL);
5244  rcu_assign_pointer(css->id, NULL);
5245  spin_lock(&ss->id_lock);
5246  idr_remove(&ss->idr, id->id);
5247  spin_unlock(&ss->id_lock);
5248  kfree_rcu(id, rcu_head);
5249 }
5251 
5252 /*
5253  * This is called by init or create(). Thus, calls to this function are
5254  * always serialized (by cgroup_mutex at create()).
5255  */
5256 
5257 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5258 {
5259  struct css_id *newid;
5260  int myid, error, size;
5261 
5262  BUG_ON(!ss->use_id);
5263 
5264  size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5265  newid = kzalloc(size, GFP_KERNEL);
5266  if (!newid)
5267  return ERR_PTR(-ENOMEM);
5268  /* get id */
5269  if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
5270  error = -ENOMEM;
5271  goto err_out;
5272  }
5273  spin_lock(&ss->id_lock);
5274  /* Don't use 0; allocate an ID in the range 1-65535 */
5275  error = idr_get_new_above(&ss->idr, newid, 1, &myid);
5276  spin_unlock(&ss->id_lock);
5277 
5278  /* Returns an error when there is no free slot for a new ID. */
5279  if (error) {
5280  error = -ENOSPC;
5281  goto err_out;
5282  }
5283  if (myid > CSS_ID_MAX)
5284  goto remove_idr;
5285 
5286  newid->id = myid;
5287  newid->depth = depth;
5288  return newid;
5289 remove_idr:
5290  error = -ENOSPC;
5291  spin_lock(&ss->id_lock);
5292  idr_remove(&ss->idr, myid);
5293  spin_unlock(&ss->id_lock);
5294 err_out:
5295  kfree(newid);
5296  return ERR_PTR(error);
5297 
5298 }
5299 
5300 static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5301  struct cgroup_subsys_state *rootcss)
5302 {
5303  struct css_id *newid;
5304 
5305  spin_lock_init(&ss->id_lock);
5306  idr_init(&ss->idr);
5307 
5308  newid = get_new_cssid(ss, 0);
5309  if (IS_ERR(newid))
5310  return PTR_ERR(newid);
5311 
5312  newid->stack[0] = newid->id;
5313  newid->css = rootcss;
5314  rootcss->id = newid;
5315  return 0;
5316 }
5317 
5318 static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5319  struct cgroup *child)
5320 {
5321  int subsys_id, i, depth = 0;
5322  struct cgroup_subsys_state *parent_css, *child_css;
5323  struct css_id *child_id, *parent_id;
5324 
5325  subsys_id = ss->subsys_id;
5326  parent_css = parent->subsys[subsys_id];
5327  child_css = child->subsys[subsys_id];
5328  parent_id = parent_css->id;
5329  depth = parent_id->depth + 1;
5330 
5331  child_id = get_new_cssid(ss, depth);
5332  if (IS_ERR(child_id))
5333  return PTR_ERR(child_id);
5334 
5335  for (i = 0; i < depth; i++)
5336  child_id->stack[i] = parent_id->stack[i];
5337  child_id->stack[depth] = child_id->id;
5338  /*
5339  * The child_id->css pointer will be set after this cgroup is available;
5340  * see cgroup_populate_dir().
5341  */
5342  rcu_assign_pointer(child_css->id, child_id);
5343 
5344  return 0;
5345 }
5346 
5355 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5356 {
5357  struct css_id *cssid = NULL;
5358 
5359  BUG_ON(!ss->use_id);
5360  cssid = idr_find(&ss->idr, id);
5361 
5362  if (unlikely(!cssid))
5363  return NULL;
5364 
5365  return rcu_dereference(cssid->css);
5366 }
5368 
5379 struct cgroup_subsys_state *
5380 css_get_next(struct cgroup_subsys *ss, int id,
5381  struct cgroup_subsys_state *root, int *foundid)
5382 {
5383  struct cgroup_subsys_state *ret = NULL;
5384  struct css_id *tmp;
5385  int tmpid;
5386  int rootid = css_id(root);
5387  int depth = css_depth(root);
5388 
5389  if (!rootid)
5390  return NULL;
5391 
5392  BUG_ON(!ss->use_id);
5393  WARN_ON_ONCE(!rcu_read_lock_held());
5394 
5395  /* fill start point for scan */
5396  tmpid = id;
5397  while (1) {
5398  /*
5399  * scan the next entry from the idr's bitmap tree; tmpid is updated by
5400  * idr_get_next().
5401  */
5402  tmp = idr_get_next(&ss->idr, &tmpid);
5403  if (!tmp)
5404  break;
5405  if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5406  ret = rcu_dereference(tmp->css);
5407  if (ret) {
5408  *foundid = tmpid;
5409  break;
5410  }
5411  }
5412  /* continue to scan from next id */
5413  tmpid = tmpid + 1;
5414  }
5415  return ret;
5416 }
5417 
5418 /*
5419  * get corresponding css from file open on cgroupfs directory
5420  */
5421 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5422 {
5423  struct cgroup *cgrp;
5424  struct inode *inode;
5425  struct cgroup_subsys_state *css;
5426 
5427  inode = f->f_dentry->d_inode;
5428  /* check in cgroup filesystem dir */
5429  if (inode->i_op != &cgroup_dir_inode_operations)
5430  return ERR_PTR(-EBADF);
5431 
5432  if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5433  return ERR_PTR(-EINVAL);
5434 
5435  /* get cgroup */
5436  cgrp = __d_cgrp(f->f_dentry);
5437  css = cgrp->subsys[id];
5438  return css ? css : ERR_PTR(-ENOENT);
5439 }
5440 
5441 #ifdef CONFIG_CGROUP_DEBUG
5442 static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5443 {
5444  struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 
5446  if (!css)
5447  return ERR_PTR(-ENOMEM);
5448 
5449  return css;
5450 }
5451 
5452 static void debug_destroy(struct cgroup *cont)
5453 {
5454  kfree(cont->subsys[debug_subsys_id]);
5455 }
5456 
5457 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5458 {
5459  return atomic_read(&cont->count);
5460 }
5461 
5462 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
5463 {
5464  return cgroup_task_count(cont);
5465 }
5466 
5467 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
5468 {
5469  return (u64)(unsigned long)current->cgroups;
5470 }
5471 
5472 static u64 current_css_set_refcount_read(struct cgroup *cont,
5473  struct cftype *cft)
5474 {
5475  u64 count;
5476 
5477  rcu_read_lock();
5478  count = atomic_read(&current->cgroups->refcount);
5479  rcu_read_unlock();
5480  return count;
5481 }
5482 
5483 static int current_css_set_cg_links_read(struct cgroup *cont,
5484  struct cftype *cft,
5485  struct seq_file *seq)
5486 {
5487  struct cg_cgroup_link *link;
5488  struct css_set *cg;
5489 
5490  read_lock(&css_set_lock);
5491  rcu_read_lock();
5492  cg = rcu_dereference(current->cgroups);
5493  list_for_each_entry(link, &cg->cg_links, cg_link_list) {
5494  struct cgroup *c = link->cgrp;
5495  const char *name;
5496 
5497  if (c->dentry)
5498  name = c->dentry->d_name.name;
5499  else
5500  name = "?";
5501  seq_printf(seq, "Root %d group %s\n",
5502  c->root->hierarchy_id, name);
5503  }
5504  rcu_read_unlock();
5505  read_unlock(&css_set_lock);
5506  return 0;
5507 }
5508 
5509 #define MAX_TASKS_SHOWN_PER_CSS 25
5510 static int cgroup_css_links_read(struct cgroup *cont,
5511  struct cftype *cft,
5512  struct seq_file *seq)
5513 {
5514  struct cg_cgroup_link *link;
5515 
5516  read_lock(&css_set_lock);
5517  list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
5518  struct css_set *cg = link->cg;
5519  struct task_struct *task;
5520  int count = 0;
5521  seq_printf(seq, "css_set %p\n", cg);
5522  list_for_each_entry(task, &cg->tasks, cg_list) {
5523  if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5524  seq_puts(seq, " ...\n");
5525  break;
5526  } else {
5527  seq_printf(seq, " task %d\n",
5528  task_pid_vnr(task));
5529  }
5530  }
5531  }
5532  read_unlock(&css_set_lock);
5533  return 0;
5534 }
5535 
5536 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5537 {
5538  return test_bit(CGRP_RELEASABLE, &cgrp->flags);
5539 }
5540 
5541 static struct cftype debug_files[] = {
5542  {
5543  .name = "cgroup_refcount",
5544  .read_u64 = cgroup_refcount_read,
5545  },
5546  {
5547  .name = "taskcount",
5548  .read_u64 = debug_taskcount_read,
5549  },
5550 
5551  {
5552  .name = "current_css_set",
5553  .read_u64 = current_css_set_read,
5554  },
5555 
5556  {
5557  .name = "current_css_set_refcount",
5558  .read_u64 = current_css_set_refcount_read,
5559  },
5560 
5561  {
5562  .name = "current_css_set_cg_links",
5563  .read_seq_string = current_css_set_cg_links_read,
5564  },
5565 
5566  {
5567  .name = "cgroup_css_links",
5568  .read_seq_string = cgroup_css_links_read,
5569  },
5570 
5571  {
5572  .name = "releasable",
5573  .read_u64 = releasable_read,
5574  },
5575 
5576  { } /* terminate */
5577 };
5578 
5579 struct cgroup_subsys debug_subsys = {
5580  .name = "debug",
5581  .create = debug_create,
5582  .destroy = debug_destroy,
5583  .subsys_id = debug_subsys_id,
5584  .base_cftypes = debug_files,
5585 };
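/*
 * Illustrative use of the debug controller above (the mount point is an
 * assumption): with CONFIG_CGROUP_DEBUG enabled, mounting a hierarchy with
 * the "debug" option, e.g.
 *
 *	mount -t cgroup -o debug none /sys/fs/cgroup/debug
 *
 * exposes debug_files[] in each group as debug.taskcount,
 * debug.releasable, debug.current_css_set and so on, reporting the
 * internal refcounts and css_set links for that cgroup.
 */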
5586 #endif /* CONFIG_CGROUP_DEBUG */