Linux Kernel 3.7.1
block_dev.c
1 /*
2  * linux/fs/block_dev.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  * Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
6  */
7 
8 #include <linux/init.h>
9 #include <linux/mm.h>
10 #include <linux/fcntl.h>
11 #include <linux/slab.h>
12 #include <linux/kmod.h>
13 #include <linux/major.h>
14 #include <linux/device_cgroup.h>
15 #include <linux/highmem.h>
16 #include <linux/blkdev.h>
17 #include <linux/module.h>
18 #include <linux/blkpg.h>
19 #include <linux/magic.h>
20 #include <linux/buffer_head.h>
21 #include <linux/swap.h>
22 #include <linux/pagevec.h>
23 #include <linux/writeback.h>
24 #include <linux/mpage.h>
25 #include <linux/mount.h>
26 #include <linux/uio.h>
27 #include <linux/namei.h>
28 #include <linux/log2.h>
29 #include <linux/cleancache.h>
30 #include <asm/uaccess.h>
31 #include "internal.h"
32 
33 struct bdev_inode {
34  struct block_device bdev;
35  struct inode vfs_inode;
36 };
37 
38 static const struct address_space_operations def_blk_aops;
39 
40 static inline struct bdev_inode *BDEV_I(struct inode *inode)
41 {
42  return container_of(inode, struct bdev_inode, vfs_inode);
43 }
44 
45 inline struct block_device *I_BDEV(struct inode *inode)
46 {
47  return &BDEV_I(inode)->bdev;
48 }
50 
51 /*
52  * Move the inode from its current bdi to a new bdi. If the inode is dirty we
53  * need to move it onto the dirty list of @dst so that the inode is always on
54  * the right list.
55  */
56 static void bdev_inode_switch_bdi(struct inode *inode,
57  struct backing_dev_info *dst)
58 {
59  struct backing_dev_info *old = inode->i_data.backing_dev_info;
60 
61  if (unlikely(dst == old)) /* deadlock avoidance */
62  return;
63  bdi_lock_two(&old->wb, &dst->wb);
64  spin_lock(&inode->i_lock);
65  inode->i_data.backing_dev_info = dst;
66  if (inode->i_state & I_DIRTY)
67  list_move(&inode->i_wb_list, &dst->wb.b_dirty);
68  spin_unlock(&inode->i_lock);
69  spin_unlock(&old->wb.list_lock);
70  spin_unlock(&dst->wb.list_lock);
71 }
72 
73 /* Kill _all_ buffers and pagecache, dirty or not. */
74 void kill_bdev(struct block_device *bdev)
75 {
76  struct address_space *mapping = bdev->bd_inode->i_mapping;
77 
78  if (mapping->nrpages == 0)
79  return;
80 
81  invalidate_bh_lrus();
82  truncate_inode_pages(mapping, 0);
83 }
84 EXPORT_SYMBOL(kill_bdev);
85 
86 /* Invalidate clean unused buffers and pagecache. */
87 void invalidate_bdev(struct block_device *bdev)
88 {
89  struct address_space *mapping = bdev->bd_inode->i_mapping;
90 
91  if (mapping->nrpages == 0)
92  return;
93 
94  invalidate_bh_lrus();
95  lru_add_drain_all(); /* make sure all lru add caches are flushed */
96  invalidate_mapping_pages(mapping, 0, -1);
97  /* 99% of the time, we don't need to flush the cleancache on the bdev.
98  * But, for the strange corners, let's be cautious.
99  */
100  cleancache_invalidate_inode(mapping);
101 }
103 
104 int set_blocksize(struct block_device *bdev, int size)
105 {
106  /* Size must be a power of two, and between 512 and PAGE_SIZE */
107  if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
108  return -EINVAL;
109 
110  /* Size cannot be smaller than the size supported by the device */
111  if (size < bdev_logical_block_size(bdev))
112  return -EINVAL;
113 
114  /* Don't change the size if it is the same as the current size */
115  if (bdev->bd_block_size != size) {
116  sync_blockdev(bdev);
117  bdev->bd_block_size = size;
118  bdev->bd_inode->i_blkbits = blksize_bits(size);
119  kill_bdev(bdev);
120  }
121  return 0;
122 }
123 
124 EXPORT_SYMBOL(set_blocksize);
125 
126 int sb_set_blocksize(struct super_block *sb, int size)
127 {
128  if (set_blocksize(sb->s_bdev, size))
129  return 0;
130  /* If we get here, we know size is a power of two
131  * and its value is between 512 and PAGE_SIZE */
132  sb->s_blocksize = size;
133  sb->s_blocksize_bits = blksize_bits(size);
134  return sb->s_blocksize;
135 }
136 
137 EXPORT_SYMBOL(sb_set_blocksize);
138 
139 int sb_min_blocksize(struct super_block *sb, int size)
140 {
141  int minsize = bdev_logical_block_size(sb->s_bdev);
142  if (size < minsize)
143  size = minsize;
144  return sb_set_blocksize(sb, size);
145 }
146 
147 EXPORT_SYMBOL(sb_min_blocksize);
148 
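For context, a filesystem's fill_super callback is the typical caller of the two helpers above. The following sketch is not part of this file; the function name and the block sizes are hypothetical, and it relies on the headers this file already includes:

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/* never go below what the device can address; prefer 1K to start */
	if (!sb_min_blocksize(sb, 1024))
		return -EINVAL;

	/* ... read the on-disk superblock, learn its real block size ... */

	/* switch to the size recorded on disk (power of two <= PAGE_SIZE) */
	if (!sb_set_blocksize(sb, 4096))
		return -EINVAL;
	return 0;
}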
149 static int
150 blkdev_get_block(struct inode *inode, sector_t iblock,
151  struct buffer_head *bh, int create)
152 {
153  bh->b_bdev = I_BDEV(inode);
154  bh->b_blocknr = iblock;
155  set_buffer_mapped(bh);
156  return 0;
157 }
158 
159 static ssize_t
160 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
161  loff_t offset, unsigned long nr_segs)
162 {
163  struct file *file = iocb->ki_filp;
164  struct inode *inode = file->f_mapping->host;
165 
166  return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
167  nr_segs, blkdev_get_block, NULL, NULL, 0);
168 }
169 
170 int __sync_blockdev(struct block_device *bdev, int wait)
171 {
172  if (!bdev)
173  return 0;
174  if (!wait)
175  return filemap_flush(bdev->bd_inode->i_mapping);
176  return filemap_write_and_wait(bdev->bd_inode->i_mapping);
177 }
178 
179 /*
180  * Write out and wait upon all the dirty data associated with a block
181  * device via its mapping. Does not take the superblock lock.
182  */
183 int sync_blockdev(struct block_device *bdev)
184 {
185  return __sync_blockdev(bdev, 1);
186 }
188 
189 /*
190  * Write out and wait upon all dirty data associated with this
191  * device. Filesystem data as well as the underlying block
192  * device. Takes the superblock lock.
193  */
194 int fsync_bdev(struct block_device *bdev)
195 {
196  struct super_block *sb = get_super(bdev);
197  if (sb) {
198  int res = sync_filesystem(sb);
199  drop_super(sb);
200  return res;
201  }
202  return sync_blockdev(bdev);
203 }
205 
218 struct super_block *freeze_bdev(struct block_device *bdev)
219 {
220  struct super_block *sb;
221  int error = 0;
222 
223  mutex_lock(&bdev->bd_fsfreeze_mutex);
224  if (++bdev->bd_fsfreeze_count > 1) {
225  /*
226  * We don't even need to grab a reference - the first call
227  * to freeze_bdev grabs an active reference and only the last
228  * thaw_bdev drops it.
229  */
230  sb = get_super(bdev);
231  drop_super(sb);
232  mutex_unlock(&bdev->bd_fsfreeze_mutex);
233  return sb;
234  }
235 
236  sb = get_active_super(bdev);
237  if (!sb)
238  goto out;
239  error = freeze_super(sb);
240  if (error) {
241  deactivate_super(sb);
242  bdev->bd_fsfreeze_count--;
243  mutex_unlock(&bdev->bd_fsfreeze_mutex);
244  return ERR_PTR(error);
245  }
246  deactivate_super(sb);
247  out:
248  sync_blockdev(bdev);
249  mutex_unlock(&bdev->bd_fsfreeze_mutex);
250  return sb; /* thaw_bdev releases s->s_umount */
251 }
252 EXPORT_SYMBOL(freeze_bdev);
253 
261 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
262 {
263  int error = -EINVAL;
264 
265  mutex_lock(&bdev->bd_fsfreeze_mutex);
266  if (!bdev->bd_fsfreeze_count)
267  goto out;
268 
269  error = 0;
270  if (--bdev->bd_fsfreeze_count > 0)
271  goto out;
272 
273  if (!sb)
274  goto out;
275 
276  error = thaw_super(sb);
277  if (error) {
278  bdev->bd_fsfreeze_count++;
279  mutex_unlock(&bdev->bd_fsfreeze_mutex);
280  return error;
281  }
282 out:
283  mutex_unlock(&bdev->bd_fsfreeze_mutex);
284  return 0;
285 }
286 EXPORT_SYMBOL(thaw_bdev);
287 
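A hedged sketch of how a snapshot or backup path would pair freeze_bdev() with thaw_bdev(); the function name is hypothetical and the driver-specific work is elided:

static int example_snapshot(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* NULL if no filesystem is mounted */
	if (IS_ERR(sb))
		return PTR_ERR(sb);

	/* ... driver-specific snapshot work goes here ... */

	return thaw_bdev(bdev, sb);	/* balances the freeze above */
}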
288 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
289 {
290  return block_write_full_page(page, blkdev_get_block, wbc);
291 }
292 
293 static int blkdev_readpage(struct file * file, struct page * page)
294 {
295  return block_read_full_page(page, blkdev_get_block);
296 }
297 
298 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
299  loff_t pos, unsigned len, unsigned flags,
300  struct page **pagep, void **fsdata)
301 {
302  return block_write_begin(mapping, pos, len, flags, pagep,
303  blkdev_get_block);
304 }
305 
306 static int blkdev_write_end(struct file *file, struct address_space *mapping,
307  loff_t pos, unsigned len, unsigned copied,
308  struct page *page, void *fsdata)
309 {
310  int ret;
311  ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
312 
313  unlock_page(page);
314  page_cache_release(page);
315 
316  return ret;
317 }
318 
319 /*
320  * private llseek:
321  * for a block special file file->f_path.dentry->d_inode->i_size is zero
322  * so we compute the size by hand (just as in block_read/write above)
323  */
324 static loff_t block_llseek(struct file *file, loff_t offset, int origin)
325 {
326  struct inode *bd_inode = file->f_mapping->host;
327  loff_t size;
328  loff_t retval;
329 
330  mutex_lock(&bd_inode->i_mutex);
331  size = i_size_read(bd_inode);
332 
333  retval = -EINVAL;
334  switch (origin) {
335  case SEEK_END:
336  offset += size;
337  break;
338  case SEEK_CUR:
339  offset += file->f_pos;
340  case SEEK_SET:
341  break;
342  default:
343  goto out;
344  }
345  if (offset >= 0 && offset <= size) {
346  if (offset != file->f_pos) {
347  file->f_pos = offset;
348  }
349  retval = offset;
350  }
351 out:
352  mutex_unlock(&bd_inode->i_mutex);
353  return retval;
354 }
355 
356 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
357 {
358  struct inode *bd_inode = filp->f_mapping->host;
359  struct block_device *bdev = I_BDEV(bd_inode);
360  int error;
361 
362  error = filemap_write_and_wait_range(filp->f_mapping, start, end);
363  if (error)
364  return error;
365 
366  /*
367  * There is no need to serialise calls to blkdev_issue_flush with
368  * i_mutex and doing so causes performance issues with concurrent
369  * O_SYNC writers to a block device.
370  */
371  error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
372  if (error == -EOPNOTSUPP)
373  error = 0;
374 
375  return error;
376 }
378 
379 /*
380  * pseudo-fs
381  */
382 
384 static struct kmem_cache * bdev_cachep __read_mostly;
385 
386 static struct inode *bdev_alloc_inode(struct super_block *sb)
387 {
388  struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
389  if (!ei)
390  return NULL;
391  return &ei->vfs_inode;
392 }
393 
394 static void bdev_i_callback(struct rcu_head *head)
395 {
396  struct inode *inode = container_of(head, struct inode, i_rcu);
397  struct bdev_inode *bdi = BDEV_I(inode);
398 
399  kmem_cache_free(bdev_cachep, bdi);
400 }
401 
402 static void bdev_destroy_inode(struct inode *inode)
403 {
404  call_rcu(&inode->i_rcu, bdev_i_callback);
405 }
406 
407 static void init_once(void *foo)
408 {
409  struct bdev_inode *ei = (struct bdev_inode *) foo;
410  struct block_device *bdev = &ei->bdev;
411 
412  memset(bdev, 0, sizeof(*bdev));
413  mutex_init(&bdev->bd_mutex);
414  INIT_LIST_HEAD(&bdev->bd_inodes);
415  INIT_LIST_HEAD(&bdev->bd_list);
416 #ifdef CONFIG_SYSFS
417  INIT_LIST_HEAD(&bdev->bd_holder_disks);
418 #endif
419  inode_init_once(&ei->vfs_inode);
420  /* Initialize mutex for freeze. */
421  mutex_init(&bdev->bd_fsfreeze_mutex);
422 }
423 
424 static inline void __bd_forget(struct inode *inode)
425 {
426  list_del_init(&inode->i_devices);
427  inode->i_bdev = NULL;
428  inode->i_mapping = &inode->i_data;
429 }
430 
431 static void bdev_evict_inode(struct inode *inode)
432 {
433  struct block_device *bdev = &BDEV_I(inode)->bdev;
434  struct list_head *p;
435  truncate_inode_pages(&inode->i_data, 0);
436  invalidate_inode_buffers(inode); /* is it needed here? */
437  clear_inode(inode);
438  spin_lock(&bdev_lock);
439  while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
440  __bd_forget(list_entry(p, struct inode, i_devices));
441  }
442  list_del_init(&bdev->bd_list);
443  spin_unlock(&bdev_lock);
444 }
445 
446 static const struct super_operations bdev_sops = {
447  .statfs = simple_statfs,
448  .alloc_inode = bdev_alloc_inode,
449  .destroy_inode = bdev_destroy_inode,
450  .drop_inode = generic_delete_inode,
451  .evict_inode = bdev_evict_inode,
452 };
453 
454 static struct dentry *bd_mount(struct file_system_type *fs_type,
455  int flags, const char *dev_name, void *data)
456 {
457  return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
458 }
459 
460 static struct file_system_type bd_type = {
461  .name = "bdev",
462  .mount = bd_mount,
463  .kill_sb = kill_anon_super,
464 };
465 
466 static struct super_block *blockdev_superblock __read_mostly;
467 
468 void __init bdev_cache_init(void)
469 {
470  int err;
471  static struct vfsmount *bd_mnt;
472 
473  bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
474  0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
475  SLAB_MEM_SPREAD|SLAB_PANIC),
476  init_once);
477  err = register_filesystem(&bd_type);
478  if (err)
479  panic("Cannot register bdev pseudo-fs");
480  bd_mnt = kern_mount(&bd_type);
481  if (IS_ERR(bd_mnt))
482  panic("Cannot create bdev pseudo-fs");
483  blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
484 }
485 
486 /*
487  * Most likely a _very_ bad one - but then it's hardly critical for small
488  * /dev and can be fixed when somebody needs a really large one.
489  * Keep in mind that it will be fed through icache hash function too.
490  */
491 static inline unsigned long hash(dev_t dev)
492 {
493  return MAJOR(dev)+MINOR(dev);
494 }
495 
496 static int bdev_test(struct inode *inode, void *data)
497 {
498  return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
499 }
500 
501 static int bdev_set(struct inode *inode, void *data)
502 {
503  BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
504  return 0;
505 }
506 
507 static LIST_HEAD(all_bdevs);
508 
509 struct block_device *bdget(dev_t dev)
510 {
511  struct block_device *bdev;
512  struct inode *inode;
513 
514  inode = iget5_locked(blockdev_superblock, hash(dev),
515  bdev_test, bdev_set, &dev);
516 
517  if (!inode)
518  return NULL;
519 
520  bdev = &BDEV_I(inode)->bdev;
521 
522  if (inode->i_state & I_NEW) {
523  bdev->bd_contains = NULL;
524  bdev->bd_super = NULL;
525  bdev->bd_inode = inode;
526  bdev->bd_block_size = (1 << inode->i_blkbits);
527  bdev->bd_part_count = 0;
528  bdev->bd_invalidated = 0;
529  inode->i_mode = S_IFBLK;
530  inode->i_rdev = dev;
531  inode->i_bdev = bdev;
532  inode->i_data.a_ops = &def_blk_aops;
533  mapping_set_gfp_mask(&inode->i_data, GFP_USER);
534  inode->i_data.backing_dev_info = &default_backing_dev_info;
535  spin_lock(&bdev_lock);
536  list_add(&bdev->bd_list, &all_bdevs);
537  spin_unlock(&bdev_lock);
538  unlock_new_inode(inode);
539  }
540  return bdev;
541 }
542 
544 
549 struct block_device *bdgrab(struct block_device *bdev)
550 {
551  ihold(bdev->bd_inode);
552  return bdev;
553 }
554 
555 long nr_blockdev_pages(void)
556 {
557  struct block_device *bdev;
558  long ret = 0;
559  spin_lock(&bdev_lock);
560  list_for_each_entry(bdev, &all_bdevs, bd_list) {
561  ret += bdev->bd_inode->i_mapping->nrpages;
562  }
563  spin_unlock(&bdev_lock);
564  return ret;
565 }
566 
567 void bdput(struct block_device *bdev)
568 {
569  iput(bdev->bd_inode);
570 }
571 
572 EXPORT_SYMBOL(bdput);
573 
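As an illustration (the device numbers are hypothetical), the usual reference pattern around bdget()/bdput(): bdget() returns the bdev with its inode pinned, and bdput() drops that reference:

static void example_peek_bdev(void)
{
	struct block_device *bdev = bdget(MKDEV(8, 0));	/* e.g. sda */

	if (!bdev)
		return;
	/* ... inspect bdev->bd_inode->i_mapping, bd_block_size, ... */
	bdput(bdev);
}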
574 static struct block_device *bd_acquire(struct inode *inode)
575 {
576  struct block_device *bdev;
577 
578  spin_lock(&bdev_lock);
579  bdev = inode->i_bdev;
580  if (bdev) {
581  ihold(bdev->bd_inode);
582  spin_unlock(&bdev_lock);
583  return bdev;
584  }
585  spin_unlock(&bdev_lock);
586 
587  bdev = bdget(inode->i_rdev);
588  if (bdev) {
589  spin_lock(&bdev_lock);
590  if (!inode->i_bdev) {
591  /*
592  * We take an additional reference to bd_inode,
593  * and it's released in clear_inode() of inode.
594  * So, we can access it via ->i_mapping always
595  * without igrab().
596  */
597  ihold(bdev->bd_inode);
598  inode->i_bdev = bdev;
599  inode->i_mapping = bdev->bd_inode->i_mapping;
600  list_add(&inode->i_devices, &bdev->bd_inodes);
601  }
602  spin_unlock(&bdev_lock);
603  }
604  return bdev;
605 }
606 
607 static inline int sb_is_blkdev_sb(struct super_block *sb)
608 {
609  return sb == blockdev_superblock;
610 }
611 
612 /* Call when you free an inode */
613 
614 void bd_forget(struct inode *inode)
615 {
616  struct block_device *bdev = NULL;
617 
618  spin_lock(&bdev_lock);
619  if (inode->i_bdev) {
620  if (!sb_is_blkdev_sb(inode->i_sb))
621  bdev = inode->i_bdev;
622  __bd_forget(inode);
623  }
624  spin_unlock(&bdev_lock);
625 
626  if (bdev)
627  iput(bdev->bd_inode);
628 }
629 
644 static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
645  void *holder)
646 {
647  if (bdev->bd_holder == holder)
648  return true; /* already a holder */
649  else if (bdev->bd_holder != NULL)
650  return false; /* held by someone else */
651  else if (bdev->bd_contains == bdev)
652  return true; /* is a whole device which isn't held */
653 
654  else if (whole->bd_holder == bd_may_claim)
655  return true; /* is a partition of a device that is being partitioned */
656  else if (whole->bd_holder != NULL)
657  return false; /* is a partition of a held device */
658  else
659  return true; /* is a partition of an un-held device */
660 }
661 
680 static int bd_prepare_to_claim(struct block_device *bdev,
681  struct block_device *whole, void *holder)
682 {
683 retry:
684  /* if someone else claimed, fail */
685  if (!bd_may_claim(bdev, whole, holder))
686  return -EBUSY;
687 
688  /* if claiming is already in progress, wait for it to finish */
689  if (whole->bd_claiming) {
690  wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
691  DEFINE_WAIT(wait);
692 
693  prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
694  spin_unlock(&bdev_lock);
695  schedule();
696  finish_wait(wq, &wait);
697  spin_lock(&bdev_lock);
698  goto retry;
699  }
700 
701  /* yay, all mine */
702  return 0;
703 }
704 
728 static struct block_device *bd_start_claiming(struct block_device *bdev,
729  void *holder)
730 {
731  struct gendisk *disk;
732  struct block_device *whole;
733  int partno, err;
734 
735  might_sleep();
736 
737  /*
738  * @bdev might not have been initialized properly yet, look up
739  * and grab the outer block device the hard way.
740  */
741  disk = get_gendisk(bdev->bd_dev, &partno);
742  if (!disk)
743  return ERR_PTR(-ENXIO);
744 
745  /*
746  * Normally, @bdev should equal what's returned from bdget_disk()
747  * if partno is 0; however, some drivers (floppy) use multiple
748  * bdev's for the same physical device and @bdev may be one of the
749  * aliases. Keep @bdev if partno is 0. This means claimer
750  * tracking is broken for those devices but it has always been that
751  * way.
752  */
753  if (partno)
754  whole = bdget_disk(disk, 0);
755  else
756  whole = bdgrab(bdev);
757 
758  module_put(disk->fops->owner);
759  put_disk(disk);
760  if (!whole)
761  return ERR_PTR(-ENOMEM);
762 
763  /* prepare to claim, if successful, mark claiming in progress */
764  spin_lock(&bdev_lock);
765 
766  err = bd_prepare_to_claim(bdev, whole, holder);
767  if (err == 0) {
768  whole->bd_claiming = holder;
769  spin_unlock(&bdev_lock);
770  return whole;
771  } else {
772  spin_unlock(&bdev_lock);
773  bdput(whole);
774  return ERR_PTR(err);
775  }
776 }
777 
778 #ifdef CONFIG_SYSFS
779 struct bd_holder_disk {
780  struct list_head list;
781  struct gendisk *disk;
782  int refcnt;
783 };
784 
785 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
786  struct gendisk *disk)
787 {
788  struct bd_holder_disk *holder;
789 
790  list_for_each_entry(holder, &bdev->bd_holder_disks, list)
791  if (holder->disk == disk)
792  return holder;
793  return NULL;
794 }
795 
796 static int add_symlink(struct kobject *from, struct kobject *to)
797 {
798  return sysfs_create_link(from, to, kobject_name(to));
799 }
800 
801 static void del_symlink(struct kobject *from, struct kobject *to)
802 {
803  sysfs_remove_link(from, kobject_name(to));
804 }
805 
834 int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
835 {
836  struct bd_holder_disk *holder;
837  int ret = 0;
838 
839  mutex_lock(&bdev->bd_mutex);
840 
841  WARN_ON_ONCE(!bdev->bd_holder);
842 
843  /* FIXME: remove the following once add_disk() handles errors */
844  if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
845  goto out_unlock;
846 
847  holder = bd_find_holder_disk(bdev, disk);
848  if (holder) {
849  holder->refcnt++;
850  goto out_unlock;
851  }
852 
853  holder = kzalloc(sizeof(*holder), GFP_KERNEL);
854  if (!holder) {
855  ret = -ENOMEM;
856  goto out_unlock;
857  }
858 
859  INIT_LIST_HEAD(&holder->list);
860  holder->disk = disk;
861  holder->refcnt = 1;
862 
863  ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
864  if (ret)
865  goto out_free;
866 
867  ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
868  if (ret)
869  goto out_del;
870  /*
871  * bdev could be deleted beneath us which would implicitly destroy
872  * the holder directory. Hold on to it.
873  */
874  kobject_get(bdev->bd_part->holder_dir);
875 
876  list_add(&holder->list, &bdev->bd_holder_disks);
877  goto out_unlock;
878 
879 out_del:
880  del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
881 out_free:
882  kfree(holder);
883 out_unlock:
884  mutex_unlock(&bdev->bd_mutex);
885  return ret;
886 }
887 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
888 
899 void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
900 {
901  struct bd_holder_disk *holder;
902 
903  mutex_lock(&bdev->bd_mutex);
904 
905  holder = bd_find_holder_disk(bdev, disk);
906 
907  if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
908  del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
909  del_symlink(bdev->bd_part->holder_dir,
910  &disk_to_dev(disk)->kobj);
911  kobject_put(bdev->bd_part->holder_dir);
912  list_del_init(&holder->list);
913  kfree(holder);
914  }
915 
916  mutex_unlock(&bdev->bd_mutex);
917 }
918 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
919 #endif
920 
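A minimal sketch of how a stacking driver (md/dm style) uses the holder-symlink helpers above; slave_bdev is assumed to have already been opened exclusively with stack_disk as the holder, and the names are hypothetical:

static int example_link_holder(struct block_device *slave_bdev,
			       struct gendisk *stack_disk)
{
	/* creates the slaves/ and holders/ symlinks in sysfs */
	int err = bd_link_disk_holder(slave_bdev, stack_disk);

	/* on teardown: bd_unlink_disk_holder(slave_bdev, stack_disk); */
	return err;
}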
931 static void flush_disk(struct block_device *bdev, bool kill_dirty)
932 {
933  if (__invalidate_device(bdev, kill_dirty)) {
934  char name[BDEVNAME_SIZE] = "";
935 
936  if (bdev->bd_disk)
937  disk_name(bdev->bd_disk, 0, name);
938  printk(KERN_WARNING "VFS: busy inodes on changed media or "
939  "resized disk %s\n", name);
940  }
941 
942  if (!bdev->bd_disk)
943  return;
944  if (disk_part_scan_enabled(bdev->bd_disk))
945  bdev->bd_invalidated = 1;
946 }
947 
956 void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
957 {
958  loff_t disk_size, bdev_size;
959 
960  disk_size = (loff_t)get_capacity(disk) << 9;
961  bdev_size = i_size_read(bdev->bd_inode);
962  if (disk_size != bdev_size) {
963  char name[BDEVNAME_SIZE];
964 
965  disk_name(disk, 0, name);
966  printk(KERN_INFO
967  "%s: detected capacity change from %lld to %lld\n",
968  name, bdev_size, disk_size);
969  i_size_write(bdev->bd_inode, disk_size);
970  flush_disk(bdev, false);
971  }
972 }
974 
983 int revalidate_disk(struct gendisk *disk)
984 {
985  struct block_device *bdev;
986  int ret = 0;
987 
988  if (disk->fops->revalidate_disk)
989  ret = disk->fops->revalidate_disk(disk);
990 
991  bdev = bdget_disk(disk, 0);
992  if (!bdev)
993  return ret;
994 
995  mutex_lock(&bdev->bd_mutex);
996  check_disk_size_change(disk, bdev);
997  mutex_unlock(&bdev->bd_mutex);
998  bdput(bdev);
999  return ret;
1000 }
1002 
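Sketch of a typical caller: a driver that learns of a capacity change updates the gendisk and then calls revalidate_disk() so the bdev inode size (and partition information) is refreshed. The function name is hypothetical:

static void example_capacity_changed(struct gendisk *disk, sector_t new_sectors)
{
	set_capacity(disk, new_sectors);
	revalidate_disk(disk);
}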
1003 /*
1004  * This routine checks whether a removable media has been changed,
1005  * and invalidates all buffer-cache-entries in that case. This
1006  * is a relatively slow routine, so we have to try to minimize using
1007  * it. Thus it is called only upon a 'mount' or 'open'. This
1008  * is the best way of combining speed and utility, I think.
1009  * People changing diskettes in the middle of an operation deserve
1010  * to lose :-)
1011  */
1012 int check_disk_change(struct block_device *bdev)
1013 {
1014  struct gendisk *disk = bdev->bd_disk;
1015  const struct block_device_operations *bdops = disk->fops;
1016  unsigned int events;
1017 
1018  events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1019  DISK_EVENT_EJECT_REQUEST);
1020  if (!(events & DISK_EVENT_MEDIA_CHANGE))
1021  return 0;
1022 
1023  flush_disk(bdev, true);
1024  if (bdops->revalidate_disk)
1025  bdops->revalidate_disk(bdev->bd_disk);
1026  return 1;
1027 }
1028 
1029 EXPORT_SYMBOL(check_disk_change);
1030 
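Sketch of the usual caller: a removable-media driver's ->open() method invokes check_disk_change() so stale caches are dropped before the new medium is used. The function name is hypothetical:

static int example_cd_open(struct block_device *bdev, fmode_t mode)
{
	check_disk_change(bdev);	/* flushes caches if the medium changed */
	return 0;
}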
1031 void bd_set_size(struct block_device *bdev, loff_t size)
1032 {
1033  unsigned bsize = bdev_logical_block_size(bdev);
1034 
1035  bdev->bd_inode->i_size = size;
1036  while (bsize < PAGE_CACHE_SIZE) {
1037  if (size & bsize)
1038  break;
1039  bsize <<= 1;
1040  }
1041  bdev->bd_block_size = bsize;
1042  bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1043 }
1045 
1046 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1047 
1048 /*
1049  * bd_mutex locking:
1050  *
1051  * mutex_lock(part->bd_mutex)
1052  * mutex_lock_nested(whole->bd_mutex, 1)
1053  */
1054 
1055 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1056 {
1057  struct gendisk *disk;
1058  struct module *owner;
1059  int ret;
1060  int partno;
1061  int perm = 0;
1062 
1063  if (mode & FMODE_READ)
1064  perm |= MAY_READ;
1065  if (mode & FMODE_WRITE)
1066  perm |= MAY_WRITE;
1067  /*
1068  * hooks: /n/, see "layering violations".
1069  */
1070  if (!for_part) {
1071  ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1072  if (ret != 0) {
1073  bdput(bdev);
1074  return ret;
1075  }
1076  }
1077 
1078  restart:
1079 
1080  ret = -ENXIO;
1081  disk = get_gendisk(bdev->bd_dev, &partno);
1082  if (!disk)
1083  goto out;
1084  owner = disk->fops->owner;
1085 
1086  disk_block_events(disk);
1087  mutex_lock_nested(&bdev->bd_mutex, for_part);
1088  if (!bdev->bd_openers) {
1089  bdev->bd_disk = disk;
1090  bdev->bd_queue = disk->queue;
1091  bdev->bd_contains = bdev;
1092  if (!partno) {
1093  struct backing_dev_info *bdi;
1094 
1095  ret = -ENXIO;
1096  bdev->bd_part = disk_get_part(disk, partno);
1097  if (!bdev->bd_part)
1098  goto out_clear;
1099 
1100  ret = 0;
1101  if (disk->fops->open) {
1102  ret = disk->fops->open(bdev, mode);
1103  if (ret == -ERESTARTSYS) {
1104  /* Lost a race with 'disk' being
1105  * deleted, try again.
1106  * See md.c
1107  */
1108  disk_put_part(bdev->bd_part);
1109  bdev->bd_part = NULL;
1110  bdev->bd_disk = NULL;
1111  bdev->bd_queue = NULL;
1112  mutex_unlock(&bdev->bd_mutex);
1113  disk_unblock_events(disk);
1114  put_disk(disk);
1115  module_put(owner);
1116  goto restart;
1117  }
1118  }
1119 
1120  if (!ret && !bdev->bd_openers) {
1121  bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1122  bdi = blk_get_backing_dev_info(bdev);
1123  if (bdi == NULL)
1124  bdi = &default_backing_dev_info;
1125  bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1126  }
1127 
1128  /*
1129  * If the device is invalidated, rescan partition
1130  * if open succeeded or failed with -ENOMEDIUM.
1131  * The latter is necessary to prevent ghost
1132  * partitions on a removed medium.
1133  */
1134  if (bdev->bd_invalidated) {
1135  if (!ret)
1136  rescan_partitions(disk, bdev);
1137  else if (ret == -ENOMEDIUM)
1138  invalidate_partitions(disk, bdev);
1139  }
1140  if (ret)
1141  goto out_clear;
1142  } else {
1143  struct block_device *whole;
1144  whole = bdget_disk(disk, 0);
1145  ret = -ENOMEM;
1146  if (!whole)
1147  goto out_clear;
1148  BUG_ON(for_part);
1149  ret = __blkdev_get(whole, mode, 1);
1150  if (ret)
1151  goto out_clear;
1152  bdev->bd_contains = whole;
1153  bdev_inode_switch_bdi(bdev->bd_inode,
1154  whole->bd_inode->i_data.backing_dev_info);
1155  bdev->bd_part = disk_get_part(disk, partno);
1156  if (!(disk->flags & GENHD_FL_UP) ||
1157  !bdev->bd_part || !bdev->bd_part->nr_sects) {
1158  ret = -ENXIO;
1159  goto out_clear;
1160  }
1161  bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1162  }
1163  } else {
1164  if (bdev->bd_contains == bdev) {
1165  ret = 0;
1166  if (bdev->bd_disk->fops->open)
1167  ret = bdev->bd_disk->fops->open(bdev, mode);
1168  /* the same as first opener case, read comment there */
1169  if (bdev->bd_invalidated) {
1170  if (!ret)
1171  rescan_partitions(bdev->bd_disk, bdev);
1172  else if (ret == -ENOMEDIUM)
1173  invalidate_partitions(bdev->bd_disk, bdev);
1174  }
1175  if (ret)
1176  goto out_unlock_bdev;
1177  }
1178  /* only one opener holds refs to the module and disk */
1179  put_disk(disk);
1180  module_put(owner);
1181  }
1182  bdev->bd_openers++;
1183  if (for_part)
1184  bdev->bd_part_count++;
1185  mutex_unlock(&bdev->bd_mutex);
1186  disk_unblock_events(disk);
1187  return 0;
1188 
1189  out_clear:
1190  disk_put_part(bdev->bd_part);
1191  bdev->bd_disk = NULL;
1192  bdev->bd_part = NULL;
1193  bdev->bd_queue = NULL;
1194  bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1195  if (bdev != bdev->bd_contains)
1196  __blkdev_put(bdev->bd_contains, mode, 1);
1197  bdev->bd_contains = NULL;
1198  out_unlock_bdev:
1199  mutex_unlock(&bdev->bd_mutex);
1200  disk_unblock_events(disk);
1201  put_disk(disk);
1202  module_put(owner);
1203  out:
1204  bdput(bdev);
1205 
1206  return ret;
1207 }
1208 
1228 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1229 {
1230  struct block_device *whole = NULL;
1231  int res;
1232 
1233  WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1234 
1235  if ((mode & FMODE_EXCL) && holder) {
1236  whole = bd_start_claiming(bdev, holder);
1237  if (IS_ERR(whole)) {
1238  bdput(bdev);
1239  return PTR_ERR(whole);
1240  }
1241  }
1242 
1243  res = __blkdev_get(bdev, mode, 0);
1244 
1245  if (whole) {
1246  struct gendisk *disk = whole->bd_disk;
1247 
1248  /* finish claiming */
1249  mutex_lock(&bdev->bd_mutex);
1250  spin_lock(&bdev_lock);
1251 
1252  if (!res) {
1253  BUG_ON(!bd_may_claim(bdev, whole, holder));
1254  /*
1255  * Note that for a whole device bd_holders
1256  * will be incremented twice, and bd_holder
1257  * will be set to bd_may_claim before being
1258  * set to holder
1259  */
1260  whole->bd_holders++;
1261  whole->bd_holder = bd_may_claim;
1262  bdev->bd_holders++;
1263  bdev->bd_holder = holder;
1264  }
1265 
1266  /* tell others that we're done */
1267  BUG_ON(whole->bd_claiming != holder);
1268  whole->bd_claiming = NULL;
1269  wake_up_bit(&whole->bd_claiming, 0);
1270 
1271  spin_unlock(&bdev_lock);
1272 
1273  /*
1274  * Block event polling for write claims if requested. Any
1275  * write holder makes the write_holder state stick until
1276  * all are released. This is good enough and tracking
1277  * individual writeable reference is too fragile given the
1278  * way @mode is used in blkdev_get/put().
1279  */
1280  if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1281  (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1282  bdev->bd_write_holder = true;
1283  disk_block_events(disk);
1284  }
1285 
1286  mutex_unlock(&bdev->bd_mutex);
1287  bdput(whole);
1288  }
1289 
1290  return res;
1291 }
1293 
1311 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1312  void *holder)
1313 {
1314  struct block_device *bdev;
1315  int err;
1316 
1317  bdev = lookup_bdev(path);
1318  if (IS_ERR(bdev))
1319  return bdev;
1320 
1321  err = blkdev_get(bdev, mode, holder);
1322  if (err)
1323  return ERR_PTR(err);
1324 
1325  if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1326  blkdev_put(bdev, mode);
1327  return ERR_PTR(-EACCES);
1328  }
1329 
1330  return bdev;
1331 }
1333 
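A minimal sketch of claiming a device by path for exclusive use and releasing it again; my_ctx is simply a tag identifying the claimer (often a filesystem type or a driver object), and the function name is hypothetical:

static int example_claim_by_path(const char *path, void *my_ctx)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, my_ctx);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... do I/O via bdev->bd_inode->i_mapping or submit_bio() ... */

	blkdev_put(bdev, mode);		/* releases the claim taken above */
	return 0;
}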
1356 struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1357 {
1358  struct block_device *bdev;
1359  int err;
1360 
1361  bdev = bdget(dev);
1362  if (!bdev)
1363  return ERR_PTR(-ENOMEM);
1364 
1365  err = blkdev_get(bdev, mode, holder);
1366  if (err)
1367  return ERR_PTR(err);
1368 
1369  return bdev;
1370 }
1372 
1373 static int blkdev_open(struct inode * inode, struct file * filp)
1374 {
1375  struct block_device *bdev;
1376 
1377  /*
1378  * Preserve backwards compatibility and allow large file access
1379  * even if userspace doesn't ask for it explicitly. Some mkfs
1380  * binary needs it. We might want to drop this workaround
1381  * during an unstable branch.
1382  */
1383  filp->f_flags |= O_LARGEFILE;
1384 
1385  if (filp->f_flags & O_NDELAY)
1386  filp->f_mode |= FMODE_NDELAY;
1387  if (filp->f_flags & O_EXCL)
1388  filp->f_mode |= FMODE_EXCL;
1389  if ((filp->f_flags & O_ACCMODE) == 3)
1390  filp->f_mode |= FMODE_WRITE_IOCTL;
1391 
1392  bdev = bd_acquire(inode);
1393  if (bdev == NULL)
1394  return -ENOMEM;
1395 
1396  filp->f_mapping = bdev->bd_inode->i_mapping;
1397 
1398  return blkdev_get(bdev, filp->f_mode, filp);
1399 }
1400 
1401 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1402 {
1403  int ret = 0;
1404  struct gendisk *disk = bdev->bd_disk;
1405  struct block_device *victim = NULL;
1406 
1407  mutex_lock_nested(&bdev->bd_mutex, for_part);
1408  if (for_part)
1409  bdev->bd_part_count--;
1410 
1411  if (!--bdev->bd_openers) {
1412  WARN_ON_ONCE(bdev->bd_holders);
1413  sync_blockdev(bdev);
1414  kill_bdev(bdev);
1415  /* ->release can cause the old bdi to disappear,
1416  * so must switch it out first
1417  */
1418  bdev_inode_switch_bdi(bdev->bd_inode,
1419  &default_backing_dev_info);
1420  }
1421  if (bdev->bd_contains == bdev) {
1422  if (disk->fops->release)
1423  ret = disk->fops->release(disk, mode);
1424  }
1425  if (!bdev->bd_openers) {
1426  struct module *owner = disk->fops->owner;
1427 
1428  disk_put_part(bdev->bd_part);
1429  bdev->bd_part = NULL;
1430  bdev->bd_disk = NULL;
1431  if (bdev != bdev->bd_contains)
1432  victim = bdev->bd_contains;
1433  bdev->bd_contains = NULL;
1434 
1435  put_disk(disk);
1436  module_put(owner);
1437  }
1438  mutex_unlock(&bdev->bd_mutex);
1439  bdput(bdev);
1440  if (victim)
1441  __blkdev_put(victim, mode, 1);
1442  return ret;
1443 }
1444 
1445 int blkdev_put(struct block_device *bdev, fmode_t mode)
1446 {
1447  mutex_lock(&bdev->bd_mutex);
1448 
1449  if (mode & FMODE_EXCL) {
1450  bool bdev_free;
1451 
1452  /*
1453  * Release a claim on the device. The holder fields
1454  * are protected with bdev_lock. bd_mutex is to
1455  * synchronize disk_holder unlinking.
1456  */
1457  spin_lock(&bdev_lock);
1458 
1459  WARN_ON_ONCE(--bdev->bd_holders < 0);
1460  WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1461 
1462  /* bd_contains might point to self, check in a separate step */
1463  if ((bdev_free = !bdev->bd_holders))
1464  bdev->bd_holder = NULL;
1465  if (!bdev->bd_contains->bd_holders)
1466  bdev->bd_contains->bd_holder = NULL;
1467 
1468  spin_unlock(&bdev_lock);
1469 
1470  /*
1471  * If this was the last claim, remove holder link and
1472  * unblock evpoll if it was a write holder.
1473  */
1474  if (bdev_free && bdev->bd_write_holder) {
1475  disk_unblock_events(bdev->bd_disk);
1476  bdev->bd_write_holder = false;
1477  }
1478  }
1479 
1480  /*
1481  * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1482  * event. This is to ensure detection of media removal commanded
1483  * from userland - e.g. eject(1).
1484  */
1485  disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
1486 
1487  mutex_unlock(&bdev->bd_mutex);
1488 
1489  return __blkdev_put(bdev, mode, 0);
1490 }
1492 
1493 static int blkdev_close(struct inode * inode, struct file * filp)
1494 {
1495  struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1496 
1497  return blkdev_put(bdev, filp->f_mode);
1498 }
1499 
1500 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1501 {
1502  struct block_device *bdev = I_BDEV(file->f_mapping->host);
1503  fmode_t mode = file->f_mode;
1504 
1505  /*
1506  * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1507  * to update it before every ioctl.
1508  */
1509  if (file->f_flags & O_NDELAY)
1510  mode |= FMODE_NDELAY;
1511  else
1512  mode &= ~FMODE_NDELAY;
1513 
1514  return blkdev_ioctl(bdev, mode, cmd, arg);
1515 }
1516 
1517 /*
1518  * Write data to the block device. Only intended for the block device itself
1519  * and the raw driver which basically is a fake block device.
1520  *
1521  * Does not take i_mutex for the write and thus is not for general purpose
1522  * use.
1523  */
1524 ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1525  unsigned long nr_segs, loff_t pos)
1526 {
1527  struct file *file = iocb->ki_filp;
1528  struct blk_plug plug;
1529  ssize_t ret;
1530 
1531  BUG_ON(iocb->ki_pos != pos);
1532 
1533  blk_start_plug(&plug);
1534  ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1535  if (ret > 0 || ret == -EIOCBQUEUED) {
1536  ssize_t err;
1537 
1538  err = generic_write_sync(file, pos, ret);
1539  if (err < 0 && ret > 0)
1540  ret = err;
1541  }
1542  blk_finish_plug(&plug);
1543  return ret;
1544 }
1546 
1547 static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1548  unsigned long nr_segs, loff_t pos)
1549 {
1550  struct file *file = iocb->ki_filp;
1551  struct inode *bd_inode = file->f_mapping->host;
1552  loff_t size = i_size_read(bd_inode);
1553 
1554  if (pos >= size)
1555  return 0;
1556 
1557  size -= pos;
1558  if (size < INT_MAX)
1559  nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1560  return generic_file_aio_read(iocb, iov, nr_segs, pos);
1561 }
1562 
1563 /*
1564  * Try to release a page associated with block device when the system
1565  * is under memory pressure.
1566  */
1567 static int blkdev_releasepage(struct page *page, gfp_t wait)
1568 {
1569  struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1570 
1571  if (super && super->s_op->bdev_try_to_free_page)
1572  return super->s_op->bdev_try_to_free_page(super, page, wait);
1573 
1574  return try_to_free_buffers(page);
1575 }
1576 
1577 static const struct address_space_operations def_blk_aops = {
1578  .readpage = blkdev_readpage,
1579  .writepage = blkdev_writepage,
1580  .write_begin = blkdev_write_begin,
1581  .write_end = blkdev_write_end,
1582  .writepages = generic_writepages,
1583  .releasepage = blkdev_releasepage,
1584  .direct_IO = blkdev_direct_IO,
1585 };
1586 
1587 const struct file_operations def_blk_fops = {
1588  .open = blkdev_open,
1589  .release = blkdev_close,
1590  .llseek = block_llseek,
1591  .read = do_sync_read,
1592  .write = do_sync_write,
1593  .aio_read = blkdev_aio_read,
1594  .aio_write = blkdev_aio_write,
1595  .mmap = generic_file_mmap,
1596  .fsync = blkdev_fsync,
1597  .unlocked_ioctl = block_ioctl,
1598 #ifdef CONFIG_COMPAT
1599  .compat_ioctl = compat_blkdev_ioctl,
1600 #endif
1601  .splice_read = generic_file_splice_read,
1602  .splice_write = generic_file_splice_write,
1603 };
1604 
1605 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1606 {
1607  int res;
1608  mm_segment_t old_fs = get_fs();
1609  set_fs(KERNEL_DS);
1610  res = blkdev_ioctl(bdev, 0, cmd, arg);
1611  set_fs(old_fs);
1612  return res;
1613 }
1614 
1616 
1625 struct block_device *lookup_bdev(const char *pathname)
1626 {
1627  struct block_device *bdev;
1628  struct inode *inode;
1629  struct path path;
1630  int error;
1631 
1632  if (!pathname || !*pathname)
1633  return ERR_PTR(-EINVAL);
1634 
1635  error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1636  if (error)
1637  return ERR_PTR(error);
1638 
1639  inode = path.dentry->d_inode;
1640  error = -ENOTBLK;
1641  if (!S_ISBLK(inode->i_mode))
1642  goto fail;
1643  error = -EACCES;
1644  if (path.mnt->mnt_flags & MNT_NODEV)
1645  goto fail;
1646  error = -ENOMEM;
1647  bdev = bd_acquire(inode);
1648  if (!bdev)
1649  goto fail;
1650 out:
1651  path_put(&path);
1652  return bdev;
1653 fail:
1654  bdev = ERR_PTR(error);
1655  goto out;
1656 }
1658 
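Sketch of a caller that only needs to resolve a path to a device number without opening the device; lookup_bdev() returns the bdev with a reference held, which must be dropped with bdput(). The function name is hypothetical:

static dev_t example_path_to_dev(const char *path)
{
	struct block_device *bdev = lookup_bdev(path);
	dev_t dev;

	if (IS_ERR(bdev))
		return 0;
	dev = bdev->bd_dev;
	bdput(bdev);
	return dev;
}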
1659 int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1660 {
1661  struct super_block *sb = get_super(bdev);
1662  int res = 0;
1663 
1664  if (sb) {
1665  /*
1666  * no need to lock the super, get_super holds the
1667  * read mutex so the filesystem cannot go away
1668  * under us (->put_super runs with the write lock
1669  * held).
1670  */
1671  shrink_dcache_sb(sb);
1672  res = invalidate_inodes(sb, kill_dirty);
1673  drop_super(sb);
1674  }
1675  invalidate_bdev(bdev);
1676  return res;
1677 }
1679 
1680 void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
1681 {
1682  struct inode *inode, *old_inode = NULL;
1683 
1684  spin_lock(&inode_sb_list_lock);
1685  list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
1686  struct address_space *mapping = inode->i_mapping;
1687 
1688  spin_lock(&inode->i_lock);
1689  if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
1690  mapping->nrpages == 0) {
1691  spin_unlock(&inode->i_lock);
1692  continue;
1693  }
1694  __iget(inode);
1695  spin_unlock(&inode->i_lock);
1696  spin_unlock(&inode_sb_list_lock);
1697  /*
1698  * We hold a reference to 'inode' so it couldn't have been
1699  * removed from s_inodes list while we dropped the
1700  * inode_sb_list_lock. We cannot iput the inode now as we can
1701  * be holding the last reference and we cannot iput it under
1702  * inode_sb_list_lock. So we keep the reference and iput it
1703  * later.
1704  */
1705  iput(old_inode);
1706  old_inode = inode;
1707 
1708  func(I_BDEV(inode), arg);
1709 
1710  spin_lock(&inode_sb_list_lock);
1711  }
1712  spin_unlock(&inode_sb_list_lock);
1713  iput(old_inode);
1714 }