Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
device_cgroup.c
Go to the documentation of this file.
1 /*
2  * device_cgroup.c - device cgroup subsystem
3  *
4  * Copyright 2007 IBM Corp
5  */
6 
7 #include <linux/device_cgroup.h>
8 #include <linux/cgroup.h>
9 #include <linux/ctype.h>
10 #include <linux/list.h>
11 #include <linux/uaccess.h>
12 #include <linux/seq_file.h>
13 #include <linux/slab.h>
14 #include <linux/rcupdate.h>
15 #include <linux/mutex.h>
16 
/* Access-permission bits; combined into the 'access' mask of an exception. */
#define ACC_MKNOD	(1 << 0)
#define ACC_READ	(1 << 1)
#define ACC_WRITE	(1 << 2)
#define ACC_MASK	(ACC_MKNOD | ACC_READ | ACC_WRITE)

/* Device-type bits; stored in the 'type' field of an exception. */
#define DEV_BLOCK	(1 << 0)
#define DEV_CHAR	(1 << 1)
#define DEV_ALL		(1 << 2)	/* this represents all devices */
25 
26 static DEFINE_MUTEX(devcgroup_mutex);
27 
28 /*
29  * exception list locking rules:
30  * hold devcgroup_mutex for update/read.
31  * hold rcu_read_lock() for read.
32  */
33 
36  short type;
37  short access;
38  struct list_head list;
39  struct rcu_head rcu;
40 };
41 
42 struct dev_cgroup {
43  struct cgroup_subsys_state css;
45  enum {
48  } behavior;
49 };
50 
51 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
52 {
53  return container_of(s, struct dev_cgroup, css);
54 }
55 
56 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
57 {
58  return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
59 }
60 
61 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
62 {
63  return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
64 }
65 
66 struct cgroup_subsys devices_subsys;
67 
68 static int devcgroup_can_attach(struct cgroup *new_cgrp,
69  struct cgroup_taskset *set)
70 {
71  struct task_struct *task = cgroup_taskset_first(set);
72 
73  if (current != task && !capable(CAP_SYS_ADMIN))
74  return -EPERM;
75  return 0;
76 }
77 
78 /*
79  * called under devcgroup_mutex
80  */
81 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
82 {
83  struct dev_exception_item *ex, *tmp, *new;
84 
85  list_for_each_entry(ex, orig, list) {
86  new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
87  if (!new)
88  goto free_and_exit;
89  list_add_tail(&new->list, dest);
90  }
91 
92  return 0;
93 
94 free_and_exit:
95  list_for_each_entry_safe(ex, tmp, dest, list) {
96  list_del(&ex->list);
97  kfree(ex);
98  }
99  return -ENOMEM;
100 }
101 
102 /*
103  * called under devcgroup_mutex
104  */
105 static int dev_exception_add(struct dev_cgroup *dev_cgroup,
106  struct dev_exception_item *ex)
107 {
108  struct dev_exception_item *excopy, *walk;
109 
110  excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
111  if (!excopy)
112  return -ENOMEM;
113 
114  list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
115  if (walk->type != ex->type)
116  continue;
117  if (walk->major != ex->major)
118  continue;
119  if (walk->minor != ex->minor)
120  continue;
121 
122  walk->access |= ex->access;
123  kfree(excopy);
124  excopy = NULL;
125  }
126 
127  if (excopy != NULL)
128  list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
129  return 0;
130 }
131 
132 /*
133  * called under devcgroup_mutex
134  */
135 static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
136  struct dev_exception_item *ex)
137 {
138  struct dev_exception_item *walk, *tmp;
139 
140  list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
141  if (walk->type != ex->type)
142  continue;
143  if (walk->major != ex->major)
144  continue;
145  if (walk->minor != ex->minor)
146  continue;
147 
148  walk->access &= ~ex->access;
149  if (!walk->access) {
150  list_del_rcu(&walk->list);
151  kfree_rcu(walk, rcu);
152  }
153  }
154 }
155 
162 static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
163 {
164  struct dev_exception_item *ex, *tmp;
165 
166  list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
167  list_del_rcu(&ex->list);
168  kfree_rcu(ex, rcu);
169  }
170 }
171 
172 /*
173  * called from kernel/cgroup.c with cgroup_lock() held.
174  */
175 static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
176 {
177  struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
178  struct cgroup *parent_cgroup;
179  int ret;
180 
181  dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
182  if (!dev_cgroup)
183  return ERR_PTR(-ENOMEM);
184  INIT_LIST_HEAD(&dev_cgroup->exceptions);
185  parent_cgroup = cgroup->parent;
186 
187  if (parent_cgroup == NULL)
188  dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
189  else {
190  parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
191  mutex_lock(&devcgroup_mutex);
192  ret = dev_exceptions_copy(&dev_cgroup->exceptions,
193  &parent_dev_cgroup->exceptions);
194  dev_cgroup->behavior = parent_dev_cgroup->behavior;
195  mutex_unlock(&devcgroup_mutex);
196  if (ret) {
197  kfree(dev_cgroup);
198  return ERR_PTR(ret);
199  }
200  }
201 
202  return &dev_cgroup->css;
203 }
204 
/* Release a cgroup's devices state: drop its exceptions, then free it. */
static void devcgroup_destroy(struct cgroup *cgroup)
{
	struct dev_cgroup *devcg = cgroup_to_devcgroup(cgroup);

	dev_exception_clean(devcg);
	kfree(devcg);
}
213 
/* Control-file identifiers, stored in cftype.private. */
#define DEVCG_ALLOW	1
#define DEVCG_DENY	2
#define DEVCG_LIST	3

/* Sizes of the scratch buffers used when formatting the "list" file. */
#define MAJMINLEN	13	/* one major or minor number as text */
#define ACCLEN		4	/* up to "rwm" plus the terminating NUL */
220 
221 static void set_access(char *acc, short access)
222 {
223  int idx = 0;
224  memset(acc, 0, ACCLEN);
225  if (access & ACC_READ)
226  acc[idx++] = 'r';
227  if (access & ACC_WRITE)
228  acc[idx++] = 'w';
229  if (access & ACC_MKNOD)
230  acc[idx++] = 'm';
231 }
232 
233 static char type_to_char(short type)
234 {
235  if (type == DEV_ALL)
236  return 'a';
237  if (type == DEV_CHAR)
238  return 'c';
239  if (type == DEV_BLOCK)
240  return 'b';
241  return 'X';
242 }
243 
/*
 * Render a major or minor number as text.
 * @str: output buffer; receives "*" for the wildcard value ~0, otherwise
 *       the number in decimal
 * @m: the number to format
 */
static void set_majmin(char *str, unsigned m)
{
	if (m != ~0) {
		sprintf(str, "%u", m);
		return;
	}

	strcpy(str, "*");
}
251 
252 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
253  struct seq_file *m)
254 {
255  struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
256  struct dev_exception_item *ex;
257  char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
258 
259  rcu_read_lock();
260  /*
261  * To preserve the compatibility:
262  * - Only show the "all devices" when the default policy is to allow
263  * - List the exceptions in case the default policy is to deny
264  * This way, the file remains as a "whitelist of devices"
265  */
266  if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
267  set_access(acc, ACC_MASK);
268  set_majmin(maj, ~0);
269  set_majmin(min, ~0);
270  seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
271  maj, min, acc);
272  } else {
273  list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
274  set_access(acc, ex->access);
275  set_majmin(maj, ex->major);
276  set_majmin(min, ex->minor);
277  seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
278  maj, min, acc);
279  }
280  }
281  rcu_read_unlock();
282 
283  return 0;
284 }
285 
295 static int may_access(struct dev_cgroup *dev_cgroup,
296  struct dev_exception_item *refex)
297 {
298  struct dev_exception_item *ex;
299  bool match = false;
300 
301  list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
302  if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
303  continue;
304  if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
305  continue;
306  if (ex->major != ~0 && ex->major != refex->major)
307  continue;
308  if (ex->minor != ~0 && ex->minor != refex->minor)
309  continue;
310  if (refex->access & (~ex->access))
311  continue;
312  match = true;
313  break;
314  }
315 
316  /*
317  * In two cases we'll consider this new exception valid:
318  * - the dev cgroup has its default policy to allow + exception list:
319  * the new exception should *not* match any of the exceptions
320  * (behavior == DEVCG_DEFAULT_ALLOW, !match)
321  * - the dev cgroup has its default policy to deny + exception list:
322  * the new exception *should* match the exceptions
323  * (behavior == DEVCG_DEFAULT_DENY, match)
324  */
325  if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match)
326  return 1;
327  return 0;
328 }
329 
330 /*
331  * parent_has_perm:
332  * when adding a new allow rule to a device exception list, the rule
333  * must be allowed in the parent device
334  */
335 static int parent_has_perm(struct dev_cgroup *childcg,
336  struct dev_exception_item *ex)
337 {
338  struct cgroup *pcg = childcg->css.cgroup->parent;
339  struct dev_cgroup *parent;
340 
341  if (!pcg)
342  return 1;
343  parent = cgroup_to_devcgroup(pcg);
344  return may_access(parent, ex);
345 }
346 
353 static inline int may_allow_all(struct dev_cgroup *parent)
354 {
355  if (!parent)
356  return 1;
357  return parent->behavior == DEVCG_DEFAULT_ALLOW;
358 }
359 
360 /*
361  * Modify the exception list using allow/deny rules.
362  * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
363  * so we can give a container CAP_MKNOD to let it create devices but not
364  * modify the exception list.
365  * It seems likely we'll want to add a CAP_CONTAINER capability to allow
366  * us to also grant CAP_SYS_ADMIN to containers without giving away the
367  * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
368  *
369  * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting
370  * new access is only allowed if you're in the top-level cgroup, or your
371  * parent cgroup has the access you're asking for.
372  */
373 static int devcgroup_update_access(struct dev_cgroup *devcgroup,
374  int filetype, const char *buffer)
375 {
376  const char *b;
377  char temp[12]; /* 11 + 1 characters needed for a u32 */
378  int count, rc;
379  struct dev_exception_item ex;
380  struct cgroup *p = devcgroup->css.cgroup;
381  struct dev_cgroup *parent = NULL;
382 
383  if (!capable(CAP_SYS_ADMIN))
384  return -EPERM;
385 
386  if (p->parent)
387  parent = cgroup_to_devcgroup(p->parent);
388 
389  memset(&ex, 0, sizeof(ex));
390  b = buffer;
391 
392  switch (*b) {
393  case 'a':
394  switch (filetype) {
395  case DEVCG_ALLOW:
396  if (!may_allow_all(parent))
397  return -EPERM;
398  dev_exception_clean(devcgroup);
399  devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
400  if (!parent)
401  break;
402 
403  rc = dev_exceptions_copy(&devcgroup->exceptions,
404  &parent->exceptions);
405  if (rc)
406  return rc;
407  break;
408  case DEVCG_DENY:
409  dev_exception_clean(devcgroup);
410  devcgroup->behavior = DEVCG_DEFAULT_DENY;
411  break;
412  default:
413  return -EINVAL;
414  }
415  return 0;
416  case 'b':
417  ex.type = DEV_BLOCK;
418  break;
419  case 'c':
420  ex.type = DEV_CHAR;
421  break;
422  default:
423  return -EINVAL;
424  }
425  b++;
426  if (!isspace(*b))
427  return -EINVAL;
428  b++;
429  if (*b == '*') {
430  ex.major = ~0;
431  b++;
432  } else if (isdigit(*b)) {
433  memset(temp, 0, sizeof(temp));
434  for (count = 0; count < sizeof(temp) - 1; count++) {
435  temp[count] = *b;
436  b++;
437  if (!isdigit(*b))
438  break;
439  }
440  rc = kstrtou32(temp, 10, &ex.major);
441  if (rc)
442  return -EINVAL;
443  } else {
444  return -EINVAL;
445  }
446  if (*b != ':')
447  return -EINVAL;
448  b++;
449 
450  /* read minor */
451  if (*b == '*') {
452  ex.minor = ~0;
453  b++;
454  } else if (isdigit(*b)) {
455  memset(temp, 0, sizeof(temp));
456  for (count = 0; count < sizeof(temp) - 1; count++) {
457  temp[count] = *b;
458  b++;
459  if (!isdigit(*b))
460  break;
461  }
462  rc = kstrtou32(temp, 10, &ex.minor);
463  if (rc)
464  return -EINVAL;
465  } else {
466  return -EINVAL;
467  }
468  if (!isspace(*b))
469  return -EINVAL;
470  for (b++, count = 0; count < 3; count++, b++) {
471  switch (*b) {
472  case 'r':
473  ex.access |= ACC_READ;
474  break;
475  case 'w':
476  ex.access |= ACC_WRITE;
477  break;
478  case 'm':
479  ex.access |= ACC_MKNOD;
480  break;
481  case '\n':
482  case '\0':
483  count = 3;
484  break;
485  default:
486  return -EINVAL;
487  }
488  }
489 
490  switch (filetype) {
491  case DEVCG_ALLOW:
492  if (!parent_has_perm(devcgroup, &ex))
493  return -EPERM;
494  /*
495  * If the default policy is to allow by default, try to remove
496  * an matching exception instead. And be silent about it: we
497  * don't want to break compatibility
498  */
499  if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
500  dev_exception_rm(devcgroup, &ex);
501  return 0;
502  }
503  return dev_exception_add(devcgroup, &ex);
504  case DEVCG_DENY:
505  /*
506  * If the default policy is to deny by default, try to remove
507  * an matching exception instead. And be silent about it: we
508  * don't want to break compatibility
509  */
510  if (devcgroup->behavior == DEVCG_DEFAULT_DENY) {
511  dev_exception_rm(devcgroup, &ex);
512  return 0;
513  }
514  return dev_exception_add(devcgroup, &ex);
515  default:
516  return -EINVAL;
517  }
518  return 0;
519 }
520 
521 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
522  const char *buffer)
523 {
524  int retval;
525 
526  mutex_lock(&devcgroup_mutex);
527  retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
528  cft->private, buffer);
529  mutex_unlock(&devcgroup_mutex);
530  return retval;
531 }
532 
533 static struct cftype dev_cgroup_files[] = {
534  {
535  .name = "allow",
536  .write_string = devcgroup_access_write,
537  .private = DEVCG_ALLOW,
538  },
539  {
540  .name = "deny",
541  .write_string = devcgroup_access_write,
542  .private = DEVCG_DENY,
543  },
544  {
545  .name = "list",
546  .read_seq_string = devcgroup_seq_read,
547  .private = DEVCG_LIST,
548  },
549  { } /* terminate */
550 };
551 
552 struct cgroup_subsys devices_subsys = {
553  .name = "devices",
554  .can_attach = devcgroup_can_attach,
555  .create = devcgroup_create,
556  .destroy = devcgroup_destroy,
557  .subsys_id = devices_subsys_id,
558  .base_cftypes = dev_cgroup_files,
559 
560  /*
561  * While devices cgroup has the rudimentary hierarchy support which
562  * checks the parent's restriction, it doesn't properly propagates
563  * config changes in ancestors to their descendents. A child
564  * should only be allowed to add more restrictions to the parent's
565  * configuration. Fix it and remove the following.
566  */
567  .broken_hierarchy = true,
568 };
569 
580 static int __devcgroup_check_permission(short type, u32 major, u32 minor,
581  short access)
582 {
583  struct dev_cgroup *dev_cgroup;
584  struct dev_exception_item ex;
585  int rc;
586 
587  memset(&ex, 0, sizeof(ex));
588  ex.type = type;
589  ex.major = major;
590  ex.minor = minor;
591  ex.access = access;
592 
593  rcu_read_lock();
594  dev_cgroup = task_devcgroup(current);
595  rc = may_access(dev_cgroup, &ex);
596  rcu_read_unlock();
597 
598  if (!rc)
599  return -EPERM;
600 
601  return 0;
602 }
603 
605 {
606  short type, access = 0;
607 
608  if (S_ISBLK(inode->i_mode))
609  type = DEV_BLOCK;
610  if (S_ISCHR(inode->i_mode))
611  type = DEV_CHAR;
612  if (mask & MAY_WRITE)
613  access |= ACC_WRITE;
614  if (mask & MAY_READ)
615  access |= ACC_READ;
616 
617  return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
618  access);
619 }
620 
622 {
623  short type;
624 
625  if (!S_ISBLK(mode) && !S_ISCHR(mode))
626  return 0;
627 
628  if (S_ISBLK(mode))
629  type = DEV_BLOCK;
630  else
631  type = DEV_CHAR;
632 
633  return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
634  ACC_MKNOD);
635 
636 }