Linux Kernel  3.7.1
super.c
1 /*
2  * linux/fs/super.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  *
6  * super.c contains code to handle: - mount structures
7  * - super-block tables
8  * - filesystem drivers list
9  * - mount system call
10  * - umount system call
11  * - ustat system call
12  *
13  * GK 2/5/95 - Changed to support mounting the root fs via NFS
14  *
15  * Added kerneld support: Jacques Gelinas and Bjorn Ekwall
16  * Added change_root: Werner Almesberger & Hans Lermen, Feb '96
17  * Added options to /proc/mounts:
18  * Torbjörn Lindh ([email protected]), April 14, 1996.
19  * Added devfs support: Richard Gooch <[email protected]>, 13-JAN-1998
20  * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
21  */
22 
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/acct.h>
26 #include <linux/blkdev.h>
27 #include <linux/mount.h>
28 #include <linux/security.h>
29 #include <linux/writeback.h> /* for the emergency remount stuff */
30 #include <linux/idr.h>
31 #include <linux/mutex.h>
32 #include <linux/backing-dev.h>
33 #include <linux/rculist_bl.h>
34 #include <linux/cleancache.h>
35 #include <linux/fsnotify.h>
36 #include <linux/lockdep.h>
37 #include "internal.h"
38 
39 
40 LIST_HEAD(super_blocks);
41 DEFINE_SPINLOCK(sb_lock);
42 
43 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
44  "sb_writers",
45  "sb_pagefaults",
46  "sb_internal",
47 };
48 
49 /*
50  * One thing we have to be careful of with a per-sb shrinker is that we don't
51  * drop the last active reference to the superblock from within the shrinker.
52  * If that happens we could trigger unregistering the shrinker from within the
53  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
54  * take a passive reference to the superblock to avoid this from occurring.
55  */
56 static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
57 {
58  struct super_block *sb;
59  int fs_objects = 0;
60  int total_objects;
61 
62  sb = container_of(shrink, struct super_block, s_shrink);
63 
64  /*
65  * Deadlock avoidance. We may hold various FS locks, and we don't want
66  * to recurse into the FS that called us in clear_inode() and friends..
67  */
68  if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
69  return -1;
70 
71  if (!grab_super_passive(sb))
72  return -1;
73 
74  if (sb->s_op && sb->s_op->nr_cached_objects)
75  fs_objects = sb->s_op->nr_cached_objects(sb);
76 
77  total_objects = sb->s_nr_dentry_unused +
78  sb->s_nr_inodes_unused + fs_objects + 1;
79 
80  if (sc->nr_to_scan) {
81  int dentries;
82  int inodes;
83 
84  /* proportion the scan between the caches */
85  dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
86  total_objects;
87  inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
88  total_objects;
89  if (fs_objects)
90  fs_objects = (sc->nr_to_scan * fs_objects) /
91  total_objects;
92  /*
93  * prune the dcache first as the icache is pinned by it, then
94  * prune the icache, followed by the filesystem specific caches
95  */
96  prune_dcache_sb(sb, dentries);
97  prune_icache_sb(sb, inodes);
98 
99  if (fs_objects && sb->s_op->free_cached_objects) {
100  sb->s_op->free_cached_objects(sb, fs_objects);
101  fs_objects = sb->s_op->nr_cached_objects(sb);
102  }
103  total_objects = sb->s_nr_dentry_unused +
104  sb->s_nr_inodes_unused + fs_objects;
105  }
106 
107  total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
108  drop_super(sb);
109  return total_objects;
110 }
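A minimal sketch (hypothetical "examplefs", not part of this file) of the two optional super_operations hooks prune_super() calls above; the 3.7-era signatures int (*nr_cached_objects)(struct super_block *) and void (*free_cached_objects)(struct super_block *, int) are assumed:

    /* assumes <linux/fs.h> for struct super_operations and <linux/atomic.h> */
    static atomic_t examplefs_nr_objects;   /* hypothetical private-cache counter */

    static int examplefs_nr_cached_objects(struct super_block *sb)
    {
        /* report how many reclaimable fs-private objects this sb holds */
        return atomic_read(&examplefs_nr_objects);
    }

    static void examplefs_free_cached_objects(struct super_block *sb, int nr_to_scan)
    {
        /* walk the hypothetical private cache and drop up to nr_to_scan entries */
    }

    static const struct super_operations examplefs_sops = {
        .nr_cached_objects  = examplefs_nr_cached_objects,
        .free_cached_objects    = examplefs_free_cached_objects,
    };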
111 
112 static int init_sb_writers(struct super_block *s, struct file_system_type *type)
113 {
114  int err;
115  int i;
116 
117  for (i = 0; i < SB_FREEZE_LEVELS; i++) {
118  err = percpu_counter_init(&s->s_writers.counter[i], 0);
119  if (err < 0)
120  goto err_out;
121  lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
122  &type->s_writers_key[i], 0);
123  }
124  init_waitqueue_head(&s->s_writers.wait);
125  init_waitqueue_head(&s->s_writers.wait_unfrozen);
126  return 0;
127 err_out:
128  while (--i >= 0)
129  percpu_counter_destroy(&s->s_writers.counter[i]);
130  return err;
131 }
132 
133 static void destroy_sb_writers(struct super_block *s)
134 {
135  int i;
136 
137  for (i = 0; i < SB_FREEZE_LEVELS; i++)
138  percpu_counter_destroy(&s->s_writers.counter[i]);
139 }
140 
149 static struct super_block *alloc_super(struct file_system_type *type, int flags)
150 {
151  struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
152  static const struct super_operations default_op;
153 
154  if (s) {
155  if (security_sb_alloc(s)) {
156  /*
157  * We cannot call security_sb_free() without
158  * security_sb_alloc() succeeding. So bail out manually
159  */
160  kfree(s);
161  s = NULL;
162  goto out;
163  }
164 #ifdef CONFIG_SMP
165  s->s_files = alloc_percpu(struct list_head);
166  if (!s->s_files)
167  goto err_out;
168  else {
169  int i;
170 
171  for_each_possible_cpu(i)
172  INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
173  }
174 #else
175  INIT_LIST_HEAD(&s->s_files);
176 #endif
177  if (init_sb_writers(s, type))
178  goto err_out;
179  s->s_flags = flags;
180  s->s_bdi = &default_backing_dev_info;
181  INIT_HLIST_NODE(&s->s_instances);
182  INIT_HLIST_BL_HEAD(&s->s_anon);
183  INIT_LIST_HEAD(&s->s_inodes);
184  INIT_LIST_HEAD(&s->s_dentry_lru);
185  INIT_LIST_HEAD(&s->s_inode_lru);
186  spin_lock_init(&s->s_inode_lru_lock);
187  INIT_LIST_HEAD(&s->s_mounts);
188  init_rwsem(&s->s_umount);
189  lockdep_set_class(&s->s_umount, &type->s_umount_key);
190  /*
191  * sget() can have s_umount recursion.
192  *
193  * When it cannot find a suitable sb, it allocates a new
194  * one (this one), and tries again to find a suitable old
195  * one.
196  *
197  * In case that succeeds, it will acquire the s_umount
198  * lock of the old one. Since these are clearly distinct
199  * locks, and this object isn't exposed yet, there's no
200  * risk of deadlocks.
201  *
202  * Annotate this by putting this lock in a different
203  * subclass.
204  */
205  down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
206  s->s_count = 1;
207  atomic_set(&s->s_active, 1);
208  mutex_init(&s->s_vfs_rename_mutex);
209  lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
210  mutex_init(&s->s_dquot.dqio_mutex);
211  mutex_init(&s->s_dquot.dqonoff_mutex);
212  init_rwsem(&s->s_dquot.dqptr_sem);
213  s->s_maxbytes = MAX_NON_LFS;
214  s->s_op = &default_op;
215  s->s_time_gran = 1000000000;
216  s->cleancache_poolid = -1;
217 
218  s->s_shrink.seeks = DEFAULT_SEEKS;
219  s->s_shrink.shrink = prune_super;
220  s->s_shrink.batch = 1024;
221  }
222 out:
223  return s;
224 err_out:
225  security_sb_free(s);
226 #ifdef CONFIG_SMP
227  if (s->s_files)
228  free_percpu(s->s_files);
229 #endif
230  destroy_sb_writers(s);
231  kfree(s);
232  s = NULL;
233  goto out;
234 }
235 
242 static inline void destroy_super(struct super_block *s)
243 {
244 #ifdef CONFIG_SMP
245  free_percpu(s->s_files);
246 #endif
247  destroy_sb_writers(s);
248  security_sb_free(s);
249  WARN_ON(!list_empty(&s->s_mounts));
250  kfree(s->s_subtype);
251  kfree(s->s_options);
252  kfree(s);
253 }
254 
255 /* Superblock refcounting */
256 
257 /*
258  * Drop a superblock's refcount. The caller must hold sb_lock.
259  */
260 static void __put_super(struct super_block *sb)
261 {
262  if (!--sb->s_count) {
263  list_del_init(&sb->s_list);
264  destroy_super(sb);
265  }
266 }
267 
275 static void put_super(struct super_block *sb)
276 {
277  spin_lock(&sb_lock);
278  __put_super(sb);
279  spin_unlock(&sb_lock);
280 }
281 
282 
294 void deactivate_locked_super(struct super_block *s)
295 {
296  struct file_system_type *fs = s->s_type;
297  if (atomic_dec_and_test(&s->s_active)) {
298  cleancache_invalidate_fs(s);
299  fs->kill_sb(s);
300 
301  /* caches are now gone, we can safely kill the shrinker now */
302  unregister_shrinker(&s->s_shrink);
303  put_filesystem(fs);
304  put_super(s);
305  } else {
306  up_write(&s->s_umount);
307  }
308 }
309 
310 EXPORT_SYMBOL(deactivate_locked_super);
311 
320 void deactivate_super(struct super_block *s)
321 {
322  if (!atomic_add_unless(&s->s_active, -1, 1)) {
323  down_write(&s->s_umount);
324  deactivate_locked_super(s);
325  }
326 }
327 
328 EXPORT_SYMBOL(deactivate_super);
329 
341 static int grab_super(struct super_block *s) __releases(sb_lock)
342 {
343  if (atomic_inc_not_zero(&s->s_active)) {
344  spin_unlock(&sb_lock);
345  return 1;
346  }
347  /* it's going away */
348  s->s_count++;
349  spin_unlock(&sb_lock);
350  /* wait for it to die */
351  down_write(&s->s_umount);
352  up_write(&s->s_umount);
353  put_super(s);
354  return 0;
355 }
356 
357 /*
358  * grab_super_passive - acquire a passive reference
359  * @sb: reference we are trying to grab
360  *
361  * Tries to acquire a passive reference. This is used in places where we
362  * cannot take an active reference but we need to ensure that the
363  * superblock does not go away while we are working on it. It returns
364  * false if a reference was not gained, and returns true with the s_umount
365  * lock held in read mode if a reference is gained. On successful return,
366  * the caller must drop the s_umount lock and the passive reference when
367  * done.
368  */
369 bool grab_super_passive(struct super_block *sb)
370 {
371  spin_lock(&sb_lock);
372  if (hlist_unhashed(&sb->s_instances)) {
373  spin_unlock(&sb_lock);
374  return false;
375  }
376 
377  sb->s_count++;
378  spin_unlock(&sb_lock);
379 
380  if (down_read_trylock(&sb->s_umount)) {
381  if (sb->s_root && (sb->s_flags & MS_BORN))
382  return true;
383  up_read(&sb->s_umount);
384  }
385 
386  put_super(sb);
387  return false;
388 }
389 
404 void generic_shutdown_super(struct super_block *sb)
405 {
406  const struct super_operations *sop = sb->s_op;
407 
408  if (sb->s_root) {
409  shrink_dcache_for_umount(sb);
410  sync_filesystem(sb);
411  sb->s_flags &= ~MS_ACTIVE;
412 
413  fsnotify_unmount_inodes(&sb->s_inodes);
414 
415  evict_inodes(sb);
416 
417  if (sop->put_super)
418  sop->put_super(sb);
419 
420  if (!list_empty(&sb->s_inodes)) {
421  printk("VFS: Busy inodes after unmount of %s. "
422  "Self-destruct in 5 seconds. Have a nice day...\n",
423  sb->s_id);
424  }
425  }
426  spin_lock(&sb_lock);
427  /* should be initialized for __put_super_and_need_restart() */
428  hlist_del_init(&sb->s_instances);
429  spin_unlock(&sb_lock);
430  up_write(&sb->s_umount);
431 }
432 
433 EXPORT_SYMBOL(generic_shutdown_super);
434 
443 struct super_block *sget(struct file_system_type *type,
444  int (*test)(struct super_block *,void *),
445  int (*set)(struct super_block *,void *),
446  int flags,
447  void *data)
448 {
449  struct super_block *s = NULL;
450  struct hlist_node *node;
451  struct super_block *old;
452  int err;
453 
454 retry:
455  spin_lock(&sb_lock);
456  if (test) {
457  hlist_for_each_entry(old, node, &type->fs_supers, s_instances) {
458  if (!test(old, data))
459  continue;
460  if (!grab_super(old))
461  goto retry;
462  if (s) {
463  up_write(&s->s_umount);
464  destroy_super(s);
465  s = NULL;
466  }
467  down_write(&old->s_umount);
468  if (unlikely(!(old->s_flags & MS_BORN))) {
469  deactivate_locked_super(old);
470  goto retry;
471  }
472  return old;
473  }
474  }
475  if (!s) {
476  spin_unlock(&sb_lock);
477  s = alloc_super(type, flags);
478  if (!s)
479  return ERR_PTR(-ENOMEM);
480  goto retry;
481  }
482 
483  err = set(s, data);
484  if (err) {
485  spin_unlock(&sb_lock);
486  up_write(&s->s_umount);
487  destroy_super(s);
488  return ERR_PTR(err);
489  }
490  s->s_type = type;
491  strlcpy(s->s_id, type->name, sizeof(s->s_id));
492  list_add_tail(&s->s_list, &super_blocks);
493  hlist_add_head(&s->s_instances, &type->fs_supers);
494  spin_unlock(&sb_lock);
495  get_filesystem(type);
496  register_shrinker(&s->s_shrink);
497  return s;
498 }
499 
500 EXPORT_SYMBOL(sget);
501 
502 void drop_super(struct super_block *sb)
503 {
504  up_read(&sb->s_umount);
505  put_super(sb);
506 }
507 
508 EXPORT_SYMBOL(drop_super);
509 
518 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
519 {
520  struct super_block *sb, *p = NULL;
521 
522  spin_lock(&sb_lock);
523  list_for_each_entry(sb, &super_blocks, s_list) {
524  if (hlist_unhashed(&sb->s_instances))
525  continue;
526  sb->s_count++;
527  spin_unlock(&sb_lock);
528 
529  down_read(&sb->s_umount);
530  if (sb->s_root && (sb->s_flags & MS_BORN))
531  f(sb, arg);
532  up_read(&sb->s_umount);
533 
534  spin_lock(&sb_lock);
535  if (p)
536  __put_super(p);
537  p = sb;
538  }
539  if (p)
540  __put_super(p);
541  spin_unlock(&sb_lock);
542 }
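A minimal sketch (an assumption, not from this file) of a callback suitable for iterate_supers(); it runs with sb->s_umount held for reading and only for superblocks that are born and still have a root dentry:

    static void count_born_sb(struct super_block *sb, void *arg)
    {
        int *count = arg;   /* caller-provided accumulator */

        (*count)++;
    }

    /* usage: int n = 0; iterate_supers(count_born_sb, &n); */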
543 
553 void iterate_supers_type(struct file_system_type *type,
554  void (*f)(struct super_block *, void *), void *arg)
555 {
556  struct super_block *sb, *p = NULL;
557  struct hlist_node *node;
558 
559  spin_lock(&sb_lock);
560  hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) {
561  sb->s_count++;
562  spin_unlock(&sb_lock);
563 
564  down_read(&sb->s_umount);
565  if (sb->s_root && (sb->s_flags & MS_BORN))
566  f(sb, arg);
567  up_read(&sb->s_umount);
568 
569  spin_lock(&sb_lock);
570  if (p)
571  __put_super(p);
572  p = sb;
573  }
574  if (p)
575  __put_super(p);
576  spin_unlock(&sb_lock);
577 }
578 
579 EXPORT_SYMBOL(iterate_supers_type);
580 
589 struct super_block *get_super(struct block_device *bdev)
590 {
591  struct super_block *sb;
592 
593  if (!bdev)
594  return NULL;
595 
596  spin_lock(&sb_lock);
597 rescan:
598  list_for_each_entry(sb, &super_blocks, s_list) {
599  if (hlist_unhashed(&sb->s_instances))
600  continue;
601  if (sb->s_bdev == bdev) {
602  sb->s_count++;
603  spin_unlock(&sb_lock);
604  down_read(&sb->s_umount);
605  /* still alive? */
606  if (sb->s_root && (sb->s_flags & MS_BORN))
607  return sb;
608  up_read(&sb->s_umount);
609  /* nope, got unmounted */
610  spin_lock(&sb_lock);
611  __put_super(sb);
612  goto rescan;
613  }
614  }
615  spin_unlock(&sb_lock);
616  return NULL;
617 }
618 
619 EXPORT_SYMBOL(get_super);
620 
630 struct super_block *get_super_thawed(struct block_device *bdev)
631 {
632  while (1) {
633  struct super_block *s = get_super(bdev);
634  if (!s || s->s_writers.frozen == SB_UNFROZEN)
635  return s;
636  up_read(&s->s_umount);
637  wait_event(s->s_writers.wait_unfrozen,
638  s->s_writers.frozen == SB_UNFROZEN);
639  put_super(s);
640  }
641 }
642 EXPORT_SYMBOL(get_super_thawed);
643 
652 struct super_block *get_active_super(struct block_device *bdev)
653 {
654  struct super_block *sb;
655 
656  if (!bdev)
657  return NULL;
658 
659 restart:
660  spin_lock(&sb_lock);
661  list_for_each_entry(sb, &super_blocks, s_list) {
662  if (hlist_unhashed(&sb->s_instances))
663  continue;
664  if (sb->s_bdev == bdev) {
665  if (grab_super(sb)) /* drops sb_lock */
666  return sb;
667  else
668  goto restart;
669  }
670  }
671  spin_unlock(&sb_lock);
672  return NULL;
673 }
674 
675 struct super_block *user_get_super(dev_t dev)
676 {
677  struct super_block *sb;
678 
679  spin_lock(&sb_lock);
680 rescan:
681  list_for_each_entry(sb, &super_blocks, s_list) {
682  if (hlist_unhashed(&sb->s_instances))
683  continue;
684  if (sb->s_dev == dev) {
685  sb->s_count++;
686  spin_unlock(&sb_lock);
687  down_read(&sb->s_umount);
688  /* still alive? */
689  if (sb->s_root && (sb->s_flags & MS_BORN))
690  return sb;
691  up_read(&sb->s_umount);
692  /* nope, got unmounted */
693  spin_lock(&sb_lock);
694  __put_super(sb);
695  goto rescan;
696  }
697  }
698  spin_unlock(&sb_lock);
699  return NULL;
700 }
701 
711 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
712 {
713  int retval;
714  int remount_ro;
715 
716  if (sb->s_writers.frozen != SB_UNFROZEN)
717  return -EBUSY;
718 
719 #ifdef CONFIG_BLOCK
720  if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
721  return -EACCES;
722 #endif
723 
724  if (flags & MS_RDONLY)
725  acct_auto_close(sb);
726  shrink_dcache_sb(sb);
727  sync_filesystem(sb);
728 
729  remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
730 
731  /* If we are remounting RDONLY and current sb is read/write,
732  make sure there are no rw files opened */
733  if (remount_ro) {
734  if (force) {
735  mark_files_ro(sb);
736  } else {
737  retval = sb_prepare_remount_readonly(sb);
738  if (retval)
739  return retval;
740  }
741  }
742 
743  if (sb->s_op->remount_fs) {
744  retval = sb->s_op->remount_fs(sb, &flags, data);
745  if (retval) {
746  if (!force)
747  goto cancel_readonly;
748  /* If forced remount, go ahead despite any errors */
749  WARN(1, "forced remount of a %s fs returned %i\n",
750  sb->s_type->name, retval);
751  }
752  }
753  sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
754  /* Needs to be ordered wrt mnt_is_readonly() */
755  smp_wmb();
756  sb->s_readonly_remount = 0;
757 
758  /*
759  * Some filesystems modify their metadata via some other path than the
760  * bdev buffer cache (eg. use a private mapping, or directories in
761  * pagecache, etc). Also file data modifications go via their own
762  * mappings. So if we try to remount read-only and then copy the filesystem
763  * from bdev, we could get stale data, so invalidate it to give a best
764  * effort at coherency.
765  */
766  if (remount_ro && sb->s_bdev)
767  invalidate_bdev(sb->s_bdev);
768  return 0;
769 
770 cancel_readonly:
771  sb->s_readonly_remount = 0;
772  return retval;
773 }
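A minimal sketch (hypothetical, not from this file) of the ->remount_fs() hook that do_remount_sb() invokes above, assuming the 3.7 signature int (*remount_fs)(struct super_block *, int *, char *); the hook may adjust the proposed flags or reparse the option string passed in data:

    static int examplefs_remount(struct super_block *sb, int *flags, char *data)
    {
        /* hypothetical policy: never silently drop synchronous mode on remount */
        if (sb->s_flags & MS_SYNCHRONOUS)
            *flags |= MS_SYNCHRONOUS;
        return 0;
    }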
774 
775 static void do_emergency_remount(struct work_struct *work)
776 {
777  struct super_block *sb, *p = NULL;
778 
779  spin_lock(&sb_lock);
780  list_for_each_entry(sb, &super_blocks, s_list) {
781  if (hlist_unhashed(&sb->s_instances))
782  continue;
783  sb->s_count++;
784  spin_unlock(&sb_lock);
785  down_write(&sb->s_umount);
786  if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
787  !(sb->s_flags & MS_RDONLY)) {
788  /*
789  * What lock protects sb->s_flags??
790  */
791  do_remount_sb(sb, MS_RDONLY, NULL, 1);
792  }
793  up_write(&sb->s_umount);
794  spin_lock(&sb_lock);
795  if (p)
796  __put_super(p);
797  p = sb;
798  }
799  if (p)
800  __put_super(p);
801  spin_unlock(&sb_lock);
802  kfree(work);
803  printk("Emergency Remount complete\n");
804 }
805 
806 void emergency_remount(void)
807 {
808  struct work_struct *work;
809 
810  work = kmalloc(sizeof(*work), GFP_ATOMIC);
811  if (work) {
812  INIT_WORK(work, do_emergency_remount);
813  schedule_work(work);
814  }
815 }
816 
817 /*
818  * Unnamed block devices are dummy devices used by virtual
819  * filesystems which don't use real block-devices. -- jrs
820  */
821 
822 static DEFINE_IDA(unnamed_dev_ida);
823 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
824 static int unnamed_dev_start = 0; /* don't bother trying below it */
825 
826 int get_anon_bdev(dev_t *p)
827 {
828  int dev;
829  int error;
830 
831  retry:
832  if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
833  return -ENOMEM;
834  spin_lock(&unnamed_dev_lock);
835  error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
836  if (!error)
837  unnamed_dev_start = dev + 1;
838  spin_unlock(&unnamed_dev_lock);
839  if (error == -EAGAIN)
840  /* We raced and lost with another CPU. */
841  goto retry;
842  else if (error)
843  return -EAGAIN;
844 
845  if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
846  spin_lock(&unnamed_dev_lock);
847  ida_remove(&unnamed_dev_ida, dev);
848  if (unnamed_dev_start > dev)
849  unnamed_dev_start = dev;
850  spin_unlock(&unnamed_dev_lock);
851  return -EMFILE;
852  }
853  *p = MKDEV(0, dev & MINORMASK);
854  return 0;
855 }
856 EXPORT_SYMBOL(get_anon_bdev);
857 
858 void free_anon_bdev(dev_t dev)
859 {
860  int slot = MINOR(dev);
861  spin_lock(&unnamed_dev_lock);
862  ida_remove(&unnamed_dev_ida, slot);
863  if (slot < unnamed_dev_start)
864  unnamed_dev_start = slot;
865  spin_unlock(&unnamed_dev_lock);
866 }
867 EXPORT_SYMBOL(free_anon_bdev);
868 
869 int set_anon_super(struct super_block *s, void *data)
870 {
871  int error = get_anon_bdev(&s->s_dev);
872  if (!error)
873  s->s_bdi = &noop_backing_dev_info;
874  return error;
875 }
876 
877 EXPORT_SYMBOL(set_anon_super);
878 
879 void kill_anon_super(struct super_block *sb)
880 {
881  dev_t dev = sb->s_dev;
882  generic_shutdown_super(sb);
883  free_anon_bdev(dev);
884 }
885 
886 EXPORT_SYMBOL(kill_anon_super);
887 
888 void kill_litter_super(struct super_block *sb)
889 {
890  if (sb->s_root)
891  d_genocide(sb->s_root);
892  kill_anon_super(sb);
893 }
894 
895 EXPORT_SYMBOL(kill_litter_super);
896 
897 static int ns_test_super(struct super_block *sb, void *data)
898 {
899  return sb->s_fs_info == data;
900 }
901 
902 static int ns_set_super(struct super_block *sb, void *data)
903 {
904  sb->s_fs_info = data;
905  return set_anon_super(sb, NULL);
906 }
907 
908 struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
909  void *data, int (*fill_super)(struct super_block *, void *, int))
910 {
911  struct super_block *sb;
912 
913  sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
914  if (IS_ERR(sb))
915  return ERR_CAST(sb);
916 
917  if (!sb->s_root) {
918  int err;
919  err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
920  if (err) {
921  deactivate_locked_super(sb);
922  return ERR_PTR(err);
923  }
924 
925  sb->s_flags |= MS_ACTIVE;
926  }
927 
928  return dget(sb->s_root);
929 }
930 
931 EXPORT_SYMBOL(mount_ns);
932 
933 #ifdef CONFIG_BLOCK
934 static int set_bdev_super(struct super_block *s, void *data)
935 {
936  s->s_bdev = data;
937  s->s_dev = s->s_bdev->bd_dev;
938 
939  /*
940  * We set the bdi here to the queue backing, file systems can
941  * overwrite this in ->fill_super()
942  */
943  s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
944  return 0;
945 }
946 
947 static int test_bdev_super(struct super_block *s, void *data)
948 {
949  return (void *)s->s_bdev == data;
950 }
951 
952 struct dentry *mount_bdev(struct file_system_type *fs_type,
953  int flags, const char *dev_name, void *data,
954  int (*fill_super)(struct super_block *, void *, int))
955 {
956  struct block_device *bdev;
957  struct super_block *s;
958  fmode_t mode = FMODE_READ | FMODE_EXCL;
959  int error = 0;
960 
961  if (!(flags & MS_RDONLY))
962  mode |= FMODE_WRITE;
963 
964  bdev = blkdev_get_by_path(dev_name, mode, fs_type);
965  if (IS_ERR(bdev))
966  return ERR_CAST(bdev);
967 
968  /*
969  * once the super is inserted into the list by sget, s_umount
970  * will protect the lockfs code from trying to start a snapshot
971  * while we are mounting
972  */
973  mutex_lock(&bdev->bd_fsfreeze_mutex);
974  if (bdev->bd_fsfreeze_count > 0) {
975  mutex_unlock(&bdev->bd_fsfreeze_mutex);
976  error = -EBUSY;
977  goto error_bdev;
978  }
979  s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
980  bdev);
981  mutex_unlock(&bdev->bd_fsfreeze_mutex);
982  if (IS_ERR(s))
983  goto error_s;
984 
985  if (s->s_root) {
986  if ((flags ^ s->s_flags) & MS_RDONLY) {
987  deactivate_locked_super(s);
988  error = -EBUSY;
989  goto error_bdev;
990  }
991 
992  /*
993  * s_umount nests inside bd_mutex during
994  * __invalidate_device(). blkdev_put() acquires
995  * bd_mutex and can't be called under s_umount. Drop
996  * s_umount temporarily. This is safe as we're
997  * holding an active reference.
998  */
999  up_write(&s->s_umount);
1000  blkdev_put(bdev, mode);
1001  down_write(&s->s_umount);
1002  } else {
1003  char b[BDEVNAME_SIZE];
1004 
1005  s->s_mode = mode;
1006  strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1007  sb_set_blocksize(s, block_size(bdev));
1008  error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1009  if (error) {
1010  deactivate_locked_super(s);
1011  goto error;
1012  }
1013 
1014  s->s_flags |= MS_ACTIVE;
1015  bdev->bd_super = s;
1016  }
1017 
1018  return dget(s->s_root);
1019 
1020 error_s:
1021  error = PTR_ERR(s);
1022 error_bdev:
1023  blkdev_put(bdev, mode);
1024 error:
1025  return ERR_PTR(error);
1026 }
1027 EXPORT_SYMBOL(mount_bdev);
1028 
1029 void kill_block_super(struct super_block *sb)
1030 {
1031  struct block_device *bdev = sb->s_bdev;
1032  fmode_t mode = sb->s_mode;
1033 
1034  bdev->bd_super = NULL;
1035  generic_shutdown_super(sb);
1036  sync_blockdev(bdev);
1037  WARN_ON_ONCE(!(mode & FMODE_EXCL));
1038  blkdev_put(bdev, mode | FMODE_EXCL);
1039 }
1040 
1041 EXPORT_SYMBOL(kill_block_super);
1042 #endif
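A minimal sketch (hypothetical "examplefs", not part of this file) of how a disk-based filesystem wires mount_bdev() and kill_block_super() into its file_system_type; examplefs_fill_super is an assumed helper with the usual fill_super signature, and <linux/module.h> is assumed for THIS_MODULE:

    static struct dentry *examplefs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
    {
        return mount_bdev(fs_type, flags, dev_name, data, examplefs_fill_super);
    }

    static struct file_system_type examplefs_fs_type = {
        .owner      = THIS_MODULE,
        .name       = "examplefs",
        .mount      = examplefs_mount,
        .kill_sb    = kill_block_super,
        .fs_flags   = FS_REQUIRES_DEV,
    };
    /* registered at module init with register_filesystem(&examplefs_fs_type) */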
1043 
1044 struct dentry *mount_nodev(struct file_system_type *fs_type,
1045  int flags, void *data,
1046  int (*fill_super)(struct super_block *, void *, int))
1047 {
1048  int error;
1049  struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
1050 
1051  if (IS_ERR(s))
1052  return ERR_CAST(s);
1053 
1054  error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1055  if (error) {
1056  deactivate_locked_super(s);
1057  return ERR_PTR(error);
1058  }
1059  s->s_flags |= MS_ACTIVE;
1060  return dget(s->s_root);
1061 }
1062 EXPORT_SYMBOL(mount_nodev);
1063 
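A companion sketch (again hypothetical) for a filesystem with no backing device, pairing mount_nodev() above with kill_litter_super() from earlier in this file; examplefs_fill_super is assumed as before:

    static struct dentry *examplefs_nodev_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
    {
        return mount_nodev(fs_type, flags, data, examplefs_fill_super);
    }

    static struct file_system_type examplefs_nodev_type = {
        .owner      = THIS_MODULE,
        .name       = "examplefs_nodev",
        .mount      = examplefs_nodev_mount,
        .kill_sb    = kill_litter_super,
    };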
1064 static int compare_single(struct super_block *s, void *p)
1065 {
1066  return 1;
1067 }
1068 
1069 struct dentry *mount_single(struct file_system_type *fs_type,
1070  int flags, void *data,
1071  int (*fill_super)(struct super_block *, void *, int))
1072 {
1073  struct super_block *s;
1074  int error;
1075 
1076  s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
1077  if (IS_ERR(s))
1078  return ERR_CAST(s);
1079  if (!s->s_root) {
1080  error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1081  if (error) {
1082  deactivate_locked_super(s);
1083  return ERR_PTR(error);
1084  }
1085  s->s_flags |= MS_ACTIVE;
1086  } else {
1087  do_remount_sb(s, flags, data, 0);
1088  }
1089  return dget(s->s_root);
1090 }
1091 EXPORT_SYMBOL(mount_single);
1092 
1093 struct dentry *
1094 mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
1095 {
1096  struct dentry *root;
1097  struct super_block *sb;
1098  char *secdata = NULL;
1099  int error = -ENOMEM;
1100 
1101  if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
1102  secdata = alloc_secdata();
1103  if (!secdata)
1104  goto out;
1105 
1106  error = security_sb_copy_data(data, secdata);
1107  if (error)
1108  goto out_free_secdata;
1109  }
1110 
1111  root = type->mount(type, flags, name, data);
1112  if (IS_ERR(root)) {
1113  error = PTR_ERR(root);
1114  goto out_free_secdata;
1115  }
1116  sb = root->d_sb;
1117  BUG_ON(!sb);
1118  WARN_ON(!sb->s_bdi);
1119  WARN_ON(sb->s_bdi == &default_backing_dev_info);
1120  sb->s_flags |= MS_BORN;
1121 
1122  error = security_sb_kern_mount(sb, flags, secdata);
1123  if (error)
1124  goto out_sb;
1125 
1126  /*
1127  * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
1128  * but s_maxbytes was an unsigned long long for many releases. Throw
1129  * this warning for a little while to try and catch filesystems that
1130  * violate this rule.
1131  */
1132  WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1133  "negative value (%lld)\n", type->name, sb->s_maxbytes);
1134 
1135  up_write(&sb->s_umount);
1136  free_secdata(secdata);
1137  return root;
1138 out_sb:
1139  dput(root);
1140  deactivate_locked_super(sb);
1141 out_free_secdata:
1142  free_secdata(secdata);
1143 out:
1144  return ERR_PTR(error);
1145 }
1146 
1147 /*
1148  * This is an internal function, please use sb_end_{write,pagefault,intwrite}
1149  * instead.
1150  */
1151 void __sb_end_write(struct super_block *sb, int level)
1152 {
1153  percpu_counter_dec(&sb->s_writers.counter[level-1]);
1154  /*
1155  * Make sure s_writers are updated before we wake up waiters in
1156  * freeze_super().
1157  */
1158  smp_mb();
1159  if (waitqueue_active(&sb->s_writers.wait))
1160  wake_up(&sb->s_writers.wait);
1161  rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
1162 }
1163 EXPORT_SYMBOL(__sb_end_write);
1164 
1165 #ifdef CONFIG_LOCKDEP
1166 /*
1167  * We want lockdep to tell us about possible deadlocks with freezing but
1168  * it's a bit tricky to properly instrument it. Getting freeze protection
1169  * works like getting a read lock but there are subtle problems. XFS for example
1170  * gets freeze protection on internal level twice in some cases, which is OK
1171  * only because we already hold a freeze protection also on higher level. Due
1172  * to these cases we have to tell lockdep we are doing trylock when we
1173  * already hold a freeze protection for a higher freeze level.
1174  */
1175 static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1176  unsigned long ip)
1177 {
1178  int i;
1179 
1180  if (!trylock) {
1181  for (i = 0; i < level - 1; i++)
1182  if (lock_is_held(&sb->s_writers.lock_map[i])) {
1183  trylock = true;
1184  break;
1185  }
1186  }
1187  rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1188 }
1189 #endif
1190 
1191 /*
1192  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
1193  * instead.
1194  */
1195 int __sb_start_write(struct super_block *sb, int level, bool wait)
1196 {
1197 retry:
1198  if (unlikely(sb->s_writers.frozen >= level)) {
1199  if (!wait)
1200  return 0;
1201  wait_event(sb->s_writers.wait_unfrozen,
1202  sb->s_writers.frozen < level);
1203  }
1204 
1205 #ifdef CONFIG_LOCKDEP
1206  acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1207 #endif
1208  percpu_counter_inc(&sb->s_writers.counter[level-1]);
1209  /*
1210  * Make sure counter is updated before we check for frozen.
1211  * freeze_super() first sets frozen and then checks the counter.
1212  */
1213  smp_mb();
1214  if (unlikely(sb->s_writers.frozen >= level)) {
1215  __sb_end_write(sb, level);
1216  goto retry;
1217  }
1218  return 1;
1219 }
1220 EXPORT_SYMBOL(__sb_start_write);
1221 
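A minimal sketch (an assumption, not from this file) of how a write path brackets modifications with the sb_start_write()/sb_end_write() wrappers from <linux/fs.h>, which call __sb_start_write()/__sb_end_write() at the SB_FREEZE_WRITE level:

    static int examplefs_update(struct super_block *sb)
    {
        sb_start_write(sb); /* sleeps while the fs is frozen at this level */
        /* ... perform the data/metadata modification ... */
        sb_end_write(sb);
        return 0;
    }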
1232 static void sb_wait_write(struct super_block *sb, int level)
1233 {
1234  s64 writers;
1235 
1236  /*
1237  * We just cycle through lockdep here so that it does not complain
1238  * about returning to userspace with a lock held
1239  */
1240  rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
1241  rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
1242 
1243  do {
1244  DEFINE_WAIT(wait);
1245 
1246  /*
1247  * We use a barrier in prepare_to_wait() to separate setting
1248  * of frozen and checking of the counter
1249  */
1250  prepare_to_wait(&sb->s_writers.wait, &wait,
1251  TASK_UNINTERRUPTIBLE);
1252 
1253  writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
1254  if (writers)
1255  schedule();
1256 
1257  finish_wait(&sb->s_writers.wait, &wait);
1258  } while (writers);
1259 }
1260 
1294 int freeze_super(struct super_block *sb)
1295 {
1296  int ret;
1297 
1298  atomic_inc(&sb->s_active);
1299  down_write(&sb->s_umount);
1300  if (sb->s_writers.frozen != SB_UNFROZEN) {
1301  deactivate_locked_super(sb);
1302  return -EBUSY;
1303  }
1304 
1305  if (!(sb->s_flags & MS_BORN)) {
1306  up_write(&sb->s_umount);
1307  return 0; /* sic - it's "nothing to do" */
1308  }
1309 
1310  if (sb->s_flags & MS_RDONLY) {
1311  /* Nothing to do really... */
1312  sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1313  up_write(&sb->s_umount);
1314  return 0;
1315  }
1316 
1317  /* From now on, no new normal writers can start */
1318  sb->s_writers.frozen = SB_FREEZE_WRITE;
1319  smp_wmb();
1320 
1321  /* Release s_umount to preserve sb_start_write -> s_umount ordering */
1322  up_write(&sb->s_umount);
1323 
1324  sb_wait_write(sb, SB_FREEZE_WRITE);
1325 
1326  /* Now we go and block page faults... */
1327  down_write(&sb->s_umount);
1328  sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
1329  smp_wmb();
1330 
1331  sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
1332 
1333  /* All writers are done so after syncing there won't be dirty data */
1334  sync_filesystem(sb);
1335 
1336  /* Now wait for internal filesystem counter */
1337  sb->s_writers.frozen = SB_FREEZE_FS;
1338  smp_wmb();
1339  sb_wait_write(sb, SB_FREEZE_FS);
1340 
1341  if (sb->s_op->freeze_fs) {
1342  ret = sb->s_op->freeze_fs(sb);
1343  if (ret) {
1344  printk(KERN_ERR
1345  "VFS:Filesystem freeze failed\n");
1346  sb->s_writers.frozen = SB_UNFROZEN;
1347  smp_wmb();
1348  wake_up(&sb->s_writers.wait_unfrozen);
1349  deactivate_locked_super(sb);
1350  return ret;
1351  }
1352  }
1353  /*
1354  * This is just for debugging purposes so that fs can warn if it
1355  * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
1356  */
1357  sb->s_writers.frozen = SB_FREEZE_COMPLETE;
1358  up_write(&sb->s_umount);
1359  return 0;
1360 }
1361 EXPORT_SYMBOL(freeze_super);
1362 
1369 int thaw_super(struct super_block *sb)
1370 {
1371  int error;
1372 
1373  down_write(&sb->s_umount);
1374  if (sb->s_writers.frozen == SB_UNFROZEN) {
1375  up_write(&sb->s_umount);
1376  return -EINVAL;
1377  }
1378 
1379  if (sb->s_flags & MS_RDONLY)
1380  goto out;
1381 
1382  if (sb->s_op->unfreeze_fs) {
1383  error = sb->s_op->unfreeze_fs(sb);
1384  if (error) {
1385  printk(KERN_ERR
1386  "VFS:Filesystem thaw failed\n");
1387  up_write(&sb->s_umount);
1388  return error;
1389  }
1390  }
1391 
1392 out:
1393  sb->s_writers.frozen = SB_UNFROZEN;
1394  smp_wmb();
1395  wake_up(&sb->s_writers.wait_unfrozen);
1396  deactivate_locked_super(sb);
1397 
1398  return 0;
1399 }
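A minimal userspace sketch (not part of this file): the FIFREEZE and FITHAW ioctls handled in fs/ioctl.c are the usual route into freeze_super() and thaw_super() for a mounted filesystem; CAP_SYS_ADMIN is required:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>       /* FIFREEZE, FITHAW */

    int snapshot_window(const char *mountpoint)
    {
        int fd = open(mountpoint, O_RDONLY);

        if (fd < 0)
            return -1;
        if (ioctl(fd, FIFREEZE, 0) == 0) {
            /* filesystem is now frozen; take the snapshot/backup here */
            ioctl(fd, FITHAW, 0);
        }
        close(fd);
        return 0;
    }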