Linux Kernel  3.7.1
bio.c
1 /*
2  * Copyright (C) 2001 Jens Axboe <[email protected]>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16  *
17  */
18 #include <linux/mm.h>
19 #include <linux/swap.h>
20 #include <linux/bio.h>
21 #include <linux/blkdev.h>
22 #include <linux/iocontext.h>
23 #include <linux/slab.h>
24 #include <linux/init.h>
25 #include <linux/kernel.h>
26 #include <linux/export.h>
27 #include <linux/mempool.h>
28 #include <linux/workqueue.h>
29 #include <linux/cgroup.h>
30 #include <scsi/sg.h> /* for struct sg_iovec */
31 
32 #include <trace/events/block.h>
33 
34 /*
35  * Test patch to inline a certain number of bi_io_vec's inside the bio
36  * itself, to shrink a bio data allocation from two mempool calls to one
37  */
38 #define BIO_INLINE_VECS 4
39 
40 static mempool_t *bio_split_pool __read_mostly;
41 
42 /*
43  * if you change this list, also change bvec_alloc or things will
44  * break badly! cannot be bigger than what you can fit into an
45  * unsigned short
46  */
47 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
48 static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
49  BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
50 };
51 #undef BV
52 
53 /*
54  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
55  * IO code that does not need private memory pools.
56  */
57 struct bio_set *fs_bio_set;
59 
60 /*
61  * Our slab pool management
62  */
63 struct bio_slab {
64  struct kmem_cache *slab;
65  unsigned int slab_ref;
66  unsigned int slab_size;
67  char name[8];
68 };
69 static DEFINE_MUTEX(bio_slab_lock);
70 static struct bio_slab *bio_slabs;
71 static unsigned int bio_slab_nr, bio_slab_max;
72 
73 static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
74 {
75  unsigned int sz = sizeof(struct bio) + extra_size;
76  struct kmem_cache *slab = NULL;
77  struct bio_slab *bslab, *new_bio_slabs;
78  unsigned int new_bio_slab_max;
79  unsigned int i, entry = -1;
80 
81  mutex_lock(&bio_slab_lock);
82 
83  i = 0;
84  while (i < bio_slab_nr) {
85  bslab = &bio_slabs[i];
86 
87  if (!bslab->slab && entry == -1)
88  entry = i;
89  else if (bslab->slab_size == sz) {
90  slab = bslab->slab;
91  bslab->slab_ref++;
92  break;
93  }
94  i++;
95  }
96 
97  if (slab)
98  goto out_unlock;
99 
100  if (bio_slab_nr == bio_slab_max && entry == -1) {
101  new_bio_slab_max = bio_slab_max << 1;
102  new_bio_slabs = krealloc(bio_slabs,
103  new_bio_slab_max * sizeof(struct bio_slab),
104  GFP_KERNEL);
105  if (!new_bio_slabs)
106  goto out_unlock;
107  bio_slab_max = new_bio_slab_max;
108  bio_slabs = new_bio_slabs;
109  }
110  if (entry == -1)
111  entry = bio_slab_nr++;
112 
113  bslab = &bio_slabs[entry];
114 
115  snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
116  slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
117  if (!slab)
118  goto out_unlock;
119 
120  printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
121  bslab->slab = slab;
122  bslab->slab_ref = 1;
123  bslab->slab_size = sz;
124 out_unlock:
125  mutex_unlock(&bio_slab_lock);
126  return slab;
127 }
128 
129 static void bio_put_slab(struct bio_set *bs)
130 {
131  struct bio_slab *bslab = NULL;
132  unsigned int i;
133 
134  mutex_lock(&bio_slab_lock);
135 
136  for (i = 0; i < bio_slab_nr; i++) {
137  if (bs->bio_slab == bio_slabs[i].slab) {
138  bslab = &bio_slabs[i];
139  break;
140  }
141  }
142 
143  if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
144  goto out;
145 
146  WARN_ON(!bslab->slab_ref);
147 
148  if (--bslab->slab_ref)
149  goto out;
150 
151  kmem_cache_destroy(bslab->slab);
152  bslab->slab = NULL;
153 
154 out:
155  mutex_unlock(&bio_slab_lock);
156 }
157 
158 unsigned int bvec_nr_vecs(unsigned short idx)
159 {
160  return bvec_slabs[idx].nr_vecs;
161 }
162 
163 void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
164 {
165  BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
166 
167  if (idx == BIOVEC_MAX_IDX)
168  mempool_free(bv, bs->bvec_pool);
169  else {
170  struct biovec_slab *bvs = bvec_slabs + idx;
171 
172  kmem_cache_free(bvs->slab, bv);
173  }
174 }
175 
176 struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
177  struct bio_set *bs)
178 {
179  struct bio_vec *bvl;
180 
181  /*
182  * see comment near bvec_array define!
183  */
184  switch (nr) {
185  case 1:
186  *idx = 0;
187  break;
188  case 2 ... 4:
189  *idx = 1;
190  break;
191  case 5 ... 16:
192  *idx = 2;
193  break;
194  case 17 ... 64:
195  *idx = 3;
196  break;
197  case 65 ... 128:
198  *idx = 4;
199  break;
200  case 129 ... BIO_MAX_PAGES:
201  *idx = 5;
202  break;
203  default:
204  return NULL;
205  }
206 
207  /*
208  * idx now points to the pool we want to allocate from. only the
209  * 1-vec entry pool is mempool backed.
210  */
211  if (*idx == BIOVEC_MAX_IDX) {
212 fallback:
213  bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
214  } else {
215  struct biovec_slab *bvs = bvec_slabs + *idx;
216  gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
217 
218  /*
219  * Make this allocation restricted and don't dump info on
220  * allocation failures, since we'll fallback to the mempool
221  * in case of failure.
222  */
223  __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
224 
225  /*
226  * Try a slab allocation. If this fails and __GFP_WAIT
227  * is set, retry with the 1-entry mempool
228  */
229  bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
230  if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
231  *idx = BIOVEC_MAX_IDX;
232  goto fallback;
233  }
234  }
235 
236  return bvl;
237 }
238 
239 static void __bio_free(struct bio *bio)
240 {
241  bio_disassociate_task(bio);
242 
243  if (bio_integrity(bio))
244  bio_integrity_free(bio);
245 }
246 
247 static void bio_free(struct bio *bio)
248 {
249  struct bio_set *bs = bio->bi_pool;
250  void *p;
251 
252  __bio_free(bio);
253 
254  if (bs) {
255  if (bio_has_allocated_vec(bio))
256  bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
257 
258  /*
259  * If we have front padding, adjust the bio pointer before freeing
260  */
261  p = bio;
262  p -= bs->front_pad;
263 
264  mempool_free(p, bs->bio_pool);
265  } else {
266  /* Bio was allocated by bio_kmalloc() */
267  kfree(bio);
268  }
269 }
270 
271 void bio_init(struct bio *bio)
272 {
273  memset(bio, 0, sizeof(*bio));
274  bio->bi_flags = 1 << BIO_UPTODATE;
275  atomic_set(&bio->bi_cnt, 1);
276 }
278 
289 void bio_reset(struct bio *bio)
290 {
291  unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
292 
293  __bio_free(bio);
294 
295  memset(bio, 0, BIO_RESET_BYTES);
296  bio->bi_flags = flags|(1 << BIO_UPTODATE);
297 }
299 
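/*
 * Illustrative sketch (not part of the original file): reusing a
 * long-lived or embedded bio with bio_reset() between submissions
 * instead of freeing and reallocating it.  The bdev and sector are
 * assumptions made for the example.
 */
static void example_bio_reuse(struct bio *bio, struct block_device *bdev,
			      sector_t sector)
{
	bio_reset(bio);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	/* ... re-add pages with bio_add_page() and resubmit ... */
}
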
320 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
321 {
322  unsigned front_pad;
323  unsigned inline_vecs;
324  unsigned long idx = BIO_POOL_NONE;
325  struct bio_vec *bvl = NULL;
326  struct bio *bio;
327  void *p;
328 
329  if (!bs) {
330  if (nr_iovecs > UIO_MAXIOV)
331  return NULL;
332 
333  p = kmalloc(sizeof(struct bio) +
334  nr_iovecs * sizeof(struct bio_vec),
335  gfp_mask);
336  front_pad = 0;
337  inline_vecs = nr_iovecs;
338  } else {
339  p = mempool_alloc(bs->bio_pool, gfp_mask);
340  front_pad = bs->front_pad;
341  inline_vecs = BIO_INLINE_VECS;
342  }
343 
344  if (unlikely(!p))
345  return NULL;
346 
347  bio = p + front_pad;
348  bio_init(bio);
349 
350  if (nr_iovecs > inline_vecs) {
351  bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
352  if (unlikely(!bvl))
353  goto err_free;
354  } else if (nr_iovecs) {
355  bvl = bio->bi_inline_vecs;
356  }
357 
358  bio->bi_pool = bs;
359  bio->bi_flags |= idx << BIO_POOL_OFFSET;
360  bio->bi_max_vecs = nr_iovecs;
361  bio->bi_io_vec = bvl;
362  return bio;
363 
364 err_free:
365  mempool_free(p, bs->bio_pool);
366  return NULL;
367 }
369 
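/*
 * Illustrative sketch (not part of the original file): the common way
 * to obtain a bio from the shared fs_bio_set is bio_alloc(), a thin
 * wrapper around bio_alloc_bioset(); bio_kmalloc() takes the bs == NULL
 * path above.  The bdev and sector used here are assumptions.
 */
static struct bio *example_bio_alloc(struct block_device *bdev)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* room for one biovec */

	if (!bio)
		return NULL;

	bio->bi_bdev = bdev;
	bio->bi_sector = 0;	/* assumed target sector */
	return bio;		/* caller drops it with bio_put() */
}
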
370 void zero_fill_bio(struct bio *bio)
371 {
372  unsigned long flags;
373  struct bio_vec *bv;
374  int i;
375 
376  bio_for_each_segment(bv, bio, i) {
377  char *data = bvec_kmap_irq(bv, &flags);
378  memset(data, 0, bv->bv_len);
379  flush_dcache_page(bv->bv_page);
380  bvec_kunmap_irq(data, &flags);
381  }
382 }
384 
393 void bio_put(struct bio *bio)
394 {
395  BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
396 
397  /*
398  * last put frees it
399  */
400  if (atomic_dec_and_test(&bio->bi_cnt))
401  bio_free(bio);
402 }
404 
405 inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
406 {
407  if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
408  blk_recount_segments(q, bio);
409 
410  return bio->bi_phys_segments;
411 }
413 
423 void __bio_clone(struct bio *bio, struct bio *bio_src)
424 {
425  memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
426  bio_src->bi_max_vecs * sizeof(struct bio_vec));
427 
428  /*
429  * most users will be overriding ->bi_bdev with a new target,
430  * so we don't set nor calculate new physical/hw segment counts here
431  */
432  bio->bi_sector = bio_src->bi_sector;
433  bio->bi_bdev = bio_src->bi_bdev;
434  bio->bi_flags |= 1 << BIO_CLONED;
435  bio->bi_rw = bio_src->bi_rw;
436  bio->bi_vcnt = bio_src->bi_vcnt;
437  bio->bi_size = bio_src->bi_size;
438  bio->bi_idx = bio_src->bi_idx;
439 }
441 
450 struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
451  struct bio_set *bs)
452 {
453  struct bio *b;
454 
455  b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
456  if (!b)
457  return NULL;
458 
459  __bio_clone(b, bio);
460 
461  if (bio_integrity(bio)) {
462  int ret;
463 
464  ret = bio_integrity_clone(b, bio, gfp_mask);
465 
466  if (ret < 0) {
467  bio_put(b);
468  return NULL;
469  }
470  }
471 
472  return b;
473 }
475 
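/*
 * Illustrative sketch (not part of the original file): a stacking
 * driver cloning an incoming bio, retargeting the clone at another
 * device/offset and resubmitting it.  new_bdev and offset are
 * assumptions; a real driver would also install its own bi_end_io.
 */
static int example_remap(struct bio *bio, struct block_device *new_bdev,
			 sector_t offset)
{
	struct bio *clone = bio_clone(bio, GFP_NOIO);

	if (!clone)
		return -ENOMEM;

	clone->bi_bdev = new_bdev;
	clone->bi_sector += offset;

	generic_make_request(clone);
	return 0;
}
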
485 int bio_get_nr_vecs(struct block_device *bdev)
486 {
487  struct request_queue *q = bdev_get_queue(bdev);
488  int nr_pages;
489 
490  nr_pages = min_t(unsigned,
491  queue_max_segments(q),
492  queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
493 
494  return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
495 
496 }
498 
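/*
 * Illustrative sketch (not part of the original file): using
 * bio_get_nr_vecs() to cap the number of pages a new bio is sized for,
 * so later bio_add_page() calls are unlikely to fail on queue limits.
 * nr_pages_wanted is an assumption for the example.
 */
static struct bio *example_sized_alloc(struct block_device *bdev,
				       unsigned int nr_pages_wanted)
{
	unsigned int nr = min_t(unsigned int, nr_pages_wanted,
				bio_get_nr_vecs(bdev));

	return bio_alloc(GFP_NOIO, nr);
}
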
499 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
500  *page, unsigned int len, unsigned int offset,
501  unsigned short max_sectors)
502 {
503  int retried_segments = 0;
504  struct bio_vec *bvec;
505 
506  /*
507  * cloned bio must not modify vec list
508  */
509  if (unlikely(bio_flagged(bio, BIO_CLONED)))
510  return 0;
511 
512  if (((bio->bi_size + len) >> 9) > max_sectors)
513  return 0;
514 
515  /*
516  * For filesystems with a blocksize smaller than the pagesize
517  * we will often be called with the same page as last time and
518  * a consecutive offset. Optimize this special case.
519  */
520  if (bio->bi_vcnt > 0) {
521  struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
522 
523  if (page == prev->bv_page &&
524  offset == prev->bv_offset + prev->bv_len) {
525  unsigned int prev_bv_len = prev->bv_len;
526  prev->bv_len += len;
527 
528  if (q->merge_bvec_fn) {
529  struct bvec_merge_data bvm = {
530  /* prev_bvec is already charged in
531  bi_size, discharge it in order to
532  simulate merging updated prev_bvec
533  as new bvec. */
534  .bi_bdev = bio->bi_bdev,
535  .bi_sector = bio->bi_sector,
536  .bi_size = bio->bi_size - prev_bv_len,
537  .bi_rw = bio->bi_rw,
538  };
539 
540  if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
541  prev->bv_len -= len;
542  return 0;
543  }
544  }
545 
546  goto done;
547  }
548  }
549 
550  if (bio->bi_vcnt >= bio->bi_max_vecs)
551  return 0;
552 
553  /*
554  * we might lose a segment or two here, but rather that than
555  * make this too complex.
556  */
557 
558  while (bio->bi_phys_segments >= queue_max_segments(q)) {
559 
560  if (retried_segments)
561  return 0;
562 
563  retried_segments = 1;
564  blk_recount_segments(q, bio);
565  }
566 
567  /*
568  * setup the new entry, we might clear it again later if we
569  * cannot add the page
570  */
571  bvec = &bio->bi_io_vec[bio->bi_vcnt];
572  bvec->bv_page = page;
573  bvec->bv_len = len;
574  bvec->bv_offset = offset;
575 
576  /*
577  * if queue has other restrictions (eg varying max sector size
578  * depending on offset), it can specify a merge_bvec_fn in the
579  * queue to get further control
580  */
581  if (q->merge_bvec_fn) {
582  struct bvec_merge_data bvm = {
583  .bi_bdev = bio->bi_bdev,
584  .bi_sector = bio->bi_sector,
585  .bi_size = bio->bi_size,
586  .bi_rw = bio->bi_rw,
587  };
588 
589  /*
590  * merge_bvec_fn() returns number of bytes it can accept
591  * at this offset
592  */
593  if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
594  bvec->bv_page = NULL;
595  bvec->bv_len = 0;
596  bvec->bv_offset = 0;
597  return 0;
598  }
599  }
600 
601  /* If we may be able to merge these biovecs, force a recount */
602  if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
603  bio->bi_flags &= ~(1 << BIO_SEG_VALID);
604 
605  bio->bi_vcnt++;
606  bio->bi_phys_segments++;
607  done:
608  bio->bi_size += len;
609  return len;
610 }
611 
627 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
628  unsigned int len, unsigned int offset)
629 {
630  return __bio_add_page(q, bio, page, len, offset,
631  queue_max_hw_sectors(q));
632 }
634 
647 int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
648  unsigned int offset)
649 {
650  struct request_queue *q = bdev_get_queue(bio->bi_bdev);
651  return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
652 }
654 
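/*
 * Illustrative sketch (not part of the original file): building a read
 * bio page by page and submitting it.  bio_add_page() returns the
 * number of bytes actually added, so a short return means the queue
 * limits were hit and the caller must stop (or start a new bio).  The
 * parameters and completion handler are assumptions for the example.
 */
static void example_submit_read(struct block_device *bdev, sector_t sector,
				struct page **pages, int nr_pages,
				bio_end_io_t *end_io, void *private)
{
	struct bio *bio = bio_alloc(GFP_NOIO, nr_pages);
	int i;

	if (!bio)
		return;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = end_io;
	bio->bi_private = private;

	for (i = 0; i < nr_pages; i++)
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;		/* queue limit reached */

	submit_bio(READ, bio);
}
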
655 struct bio_map_data {
656  struct bio_vec *iovecs;
657  struct sg_iovec *sgvecs;
658  int nr_sgvecs;
659  int is_our_pages;
660 };
661 
662 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
663  struct sg_iovec *iov, int iov_count,
664  int is_our_pages)
665 {
666  memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
667  memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
668  bmd->nr_sgvecs = iov_count;
669  bmd->is_our_pages = is_our_pages;
670  bio->bi_private = bmd;
671 }
672 
673 static void bio_free_map_data(struct bio_map_data *bmd)
674 {
675  kfree(bmd->iovecs);
676  kfree(bmd->sgvecs);
677  kfree(bmd);
678 }
679 
680 static struct bio_map_data *bio_alloc_map_data(int nr_segs,
681  unsigned int iov_count,
682  gfp_t gfp_mask)
683 {
684  struct bio_map_data *bmd;
685 
686  if (iov_count > UIO_MAXIOV)
687  return NULL;
688 
689  bmd = kmalloc(sizeof(*bmd), gfp_mask);
690  if (!bmd)
691  return NULL;
692 
693  bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
694  if (!bmd->iovecs) {
695  kfree(bmd);
696  return NULL;
697  }
698 
699  bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
700  if (bmd->sgvecs)
701  return bmd;
702 
703  kfree(bmd->iovecs);
704  kfree(bmd);
705  return NULL;
706 }
707 
708 static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
709  struct sg_iovec *iov, int iov_count,
710  int to_user, int from_user, int do_free_page)
711 {
712  int ret = 0, i;
713  struct bio_vec *bvec;
714  int iov_idx = 0;
715  unsigned int iov_off = 0;
716 
717  __bio_for_each_segment(bvec, bio, i, 0) {
718  char *bv_addr = page_address(bvec->bv_page);
719  unsigned int bv_len = iovecs[i].bv_len;
720 
721  while (bv_len && iov_idx < iov_count) {
722  unsigned int bytes;
723  char __user *iov_addr;
724 
725  bytes = min_t(unsigned int,
726  iov[iov_idx].iov_len - iov_off, bv_len);
727  iov_addr = iov[iov_idx].iov_base + iov_off;
728 
729  if (!ret) {
730  if (to_user)
731  ret = copy_to_user(iov_addr, bv_addr,
732  bytes);
733 
734  if (from_user)
735  ret = copy_from_user(bv_addr, iov_addr,
736  bytes);
737 
738  if (ret)
739  ret = -EFAULT;
740  }
741 
742  bv_len -= bytes;
743  bv_addr += bytes;
744  iov_addr += bytes;
745  iov_off += bytes;
746 
747  if (iov[iov_idx].iov_len == iov_off) {
748  iov_idx++;
749  iov_off = 0;
750  }
751  }
752 
753  if (do_free_page)
754  __free_page(bvec->bv_page);
755  }
756 
757  return ret;
758 }
759 
767 int bio_uncopy_user(struct bio *bio)
768 {
769  struct bio_map_data *bmd = bio->bi_private;
770  int ret = 0;
771 
772  if (!bio_flagged(bio, BIO_NULL_MAPPED))
773  ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
774  bmd->nr_sgvecs, bio_data_dir(bio) == READ,
775  0, bmd->is_our_pages);
776  bio_free_map_data(bmd);
777  bio_put(bio);
778  return ret;
779 }
781 
795 struct bio *bio_copy_user_iov(struct request_queue *q,
796  struct rq_map_data *map_data,
797  struct sg_iovec *iov, int iov_count,
798  int write_to_vm, gfp_t gfp_mask)
799 {
800  struct bio_map_data *bmd;
801  struct bio_vec *bvec;
802  struct page *page;
803  struct bio *bio;
804  int i, ret;
805  int nr_pages = 0;
806  unsigned int len = 0;
807  unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
808 
809  for (i = 0; i < iov_count; i++) {
810  unsigned long uaddr;
811  unsigned long end;
812  unsigned long start;
813 
814  uaddr = (unsigned long)iov[i].iov_base;
815  end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
816  start = uaddr >> PAGE_SHIFT;
817 
818  /*
819  * Overflow, abort
820  */
821  if (end < start)
822  return ERR_PTR(-EINVAL);
823 
824  nr_pages += end - start;
825  len += iov[i].iov_len;
826  }
827 
828  if (offset)
829  nr_pages++;
830 
831  bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
832  if (!bmd)
833  return ERR_PTR(-ENOMEM);
834 
835  ret = -ENOMEM;
836  bio = bio_kmalloc(gfp_mask, nr_pages);
837  if (!bio)
838  goto out_bmd;
839 
840  if (!write_to_vm)
841  bio->bi_rw |= REQ_WRITE;
842 
843  ret = 0;
844 
845  if (map_data) {
846  nr_pages = 1 << map_data->page_order;
847  i = map_data->offset / PAGE_SIZE;
848  }
849  while (len) {
850  unsigned int bytes = PAGE_SIZE;
851 
852  bytes -= offset;
853 
854  if (bytes > len)
855  bytes = len;
856 
857  if (map_data) {
858  if (i == map_data->nr_entries * nr_pages) {
859  ret = -ENOMEM;
860  break;
861  }
862 
863  page = map_data->pages[i / nr_pages];
864  page += (i % nr_pages);
865 
866  i++;
867  } else {
868  page = alloc_page(q->bounce_gfp | gfp_mask);
869  if (!page) {
870  ret = -ENOMEM;
871  break;
872  }
873  }
874 
875  if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
876  break;
877 
878  len -= bytes;
879  offset = 0;
880  }
881 
882  if (ret)
883  goto cleanup;
884 
885  /*
886  * success
887  */
888  if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
889  (map_data && map_data->from_user)) {
890  ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
891  if (ret)
892  goto cleanup;
893  }
894 
895  bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
896  return bio;
897 cleanup:
898  if (!map_data)
899  bio_for_each_segment(bvec, bio, i)
900  __free_page(bvec->bv_page);
901 
902  bio_put(bio);
903 out_bmd:
904  bio_free_map_data(bmd);
905  return ERR_PTR(ret);
906 }
907 
921 struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
922  unsigned long uaddr, unsigned int len,
923  int write_to_vm, gfp_t gfp_mask)
924 {
925  struct sg_iovec iov;
926 
927  iov.iov_base = (void __user *)uaddr;
928  iov.iov_len = len;
929 
930  return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
931 }
933 
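/*
 * Illustrative sketch (not part of the original file): the bounce-copy
 * path for a user buffer that cannot be mapped directly.  After the
 * I/O completes, bio_uncopy_user() copies data back to user space (for
 * reads) and frees the bounce pages.  q, uaddr and len are assumptions.
 */
static int example_copy_user(struct request_queue *q, unsigned long uaddr,
			     unsigned int len)
{
	struct bio *bio = bio_copy_user(q, NULL, uaddr, len, 1, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... submit the bio and wait for completion here ... */

	return bio_uncopy_user(bio);
}
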
934 static struct bio *__bio_map_user_iov(struct request_queue *q,
935  struct block_device *bdev,
936  struct sg_iovec *iov, int iov_count,
937  int write_to_vm, gfp_t gfp_mask)
938 {
939  int i, j;
940  int nr_pages = 0;
941  struct page **pages;
942  struct bio *bio;
943  int cur_page = 0;
944  int ret, offset;
945 
946  for (i = 0; i < iov_count; i++) {
947  unsigned long uaddr = (unsigned long)iov[i].iov_base;
948  unsigned long len = iov[i].iov_len;
949  unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
950  unsigned long start = uaddr >> PAGE_SHIFT;
951 
952  /*
953  * Overflow, abort
954  */
955  if (end < start)
956  return ERR_PTR(-EINVAL);
957 
958  nr_pages += end - start;
959  /*
960  * buffer must be aligned to at least hardsector size for now
961  */
962  if (uaddr & queue_dma_alignment(q))
963  return ERR_PTR(-EINVAL);
964  }
965 
966  if (!nr_pages)
967  return ERR_PTR(-EINVAL);
968 
969  bio = bio_kmalloc(gfp_mask, nr_pages);
970  if (!bio)
971  return ERR_PTR(-ENOMEM);
972 
973  ret = -ENOMEM;
974  pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
975  if (!pages)
976  goto out;
977 
978  for (i = 0; i < iov_count; i++) {
979  unsigned long uaddr = (unsigned long)iov[i].iov_base;
980  unsigned long len = iov[i].iov_len;
981  unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
982  unsigned long start = uaddr >> PAGE_SHIFT;
983  const int local_nr_pages = end - start;
984  const int page_limit = cur_page + local_nr_pages;
985 
986  ret = get_user_pages_fast(uaddr, local_nr_pages,
987  write_to_vm, &pages[cur_page]);
988  if (ret < local_nr_pages) {
989  ret = -EFAULT;
990  goto out_unmap;
991  }
992 
993  offset = uaddr & ~PAGE_MASK;
994  for (j = cur_page; j < page_limit; j++) {
995  unsigned int bytes = PAGE_SIZE - offset;
996 
997  if (len <= 0)
998  break;
999 
1000  if (bytes > len)
1001  bytes = len;
1002 
1003  /*
1004  * sorry...
1005  */
1006  if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
1007  bytes)
1008  break;
1009 
1010  len -= bytes;
1011  offset = 0;
1012  }
1013 
1014  cur_page = j;
1015  /*
1016  * release the pages we didn't map into the bio, if any
1017  */
1018  while (j < page_limit)
1019  page_cache_release(pages[j++]);
1020  }
1021 
1022  kfree(pages);
1023 
1024  /*
1025  * set data direction, and check if mapped pages need bouncing
1026  */
1027  if (!write_to_vm)
1028  bio->bi_rw |= REQ_WRITE;
1029 
1030  bio->bi_bdev = bdev;
1031  bio->bi_flags |= (1 << BIO_USER_MAPPED);
1032  return bio;
1033 
1034  out_unmap:
1035  for (i = 0; i < nr_pages; i++) {
1036  if(!pages[i])
1037  break;
1038  page_cache_release(pages[i]);
1039  }
1040  out:
1041  kfree(pages);
1042  bio_put(bio);
1043  return ERR_PTR(ret);
1044 }
1045 
1058 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
1059  unsigned long uaddr, unsigned int len, int write_to_vm,
1060  gfp_t gfp_mask)
1061 {
1062  struct sg_iovec iov;
1063 
1064  iov.iov_base = (void __user *)uaddr;
1065  iov.iov_len = len;
1066 
1067  return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
1068 }
1070 
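/*
 * Illustrative sketch (not part of the original file): zero-copy
 * mapping of a user buffer with bio_map_user(), undone again with
 * bio_unmap_user() once the I/O has completed.  q, bdev, uaddr and len
 * are assumptions for the example.
 */
static int example_map_user(struct request_queue *q, struct block_device *bdev,
			    unsigned long uaddr, unsigned int len)
{
	struct bio *bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... submit the bio and wait for completion here ... */

	bio_unmap_user(bio);
	return 0;
}
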
1083 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
1084  struct sg_iovec *iov, int iov_count,
1085  int write_to_vm, gfp_t gfp_mask)
1086 {
1087  struct bio *bio;
1088 
1089  bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
1090  gfp_mask);
1091  if (IS_ERR(bio))
1092  return bio;
1093 
1094  /*
1095  * subtle -- if __bio_map_user() ended up bouncing a bio,
1096  * it would normally disappear when its bi_end_io is run.
1097  * however, we need it for the unmap, so grab an extra
1098  * reference to it
1099  */
1100  bio_get(bio);
1101 
1102  return bio;
1103 }
1104 
1105 static void __bio_unmap_user(struct bio *bio)
1106 {
1107  struct bio_vec *bvec;
1108  int i;
1109 
1110  /*
1111  * make sure we dirty pages we wrote to
1112  */
1113  __bio_for_each_segment(bvec, bio, i, 0) {
1114  if (bio_data_dir(bio) == READ)
1115  set_page_dirty_lock(bvec->bv_page);
1116 
1117  page_cache_release(bvec->bv_page);
1118  }
1119 
1120  bio_put(bio);
1121 }
1122 
1132 void bio_unmap_user(struct bio *bio)
1133 {
1134  __bio_unmap_user(bio);
1135  bio_put(bio);
1136 }
1138 
1139 static void bio_map_kern_endio(struct bio *bio, int err)
1140 {
1141  bio_put(bio);
1142 }
1143 
1144 static struct bio *__bio_map_kern(struct request_queue *q, void *data,
1145  unsigned int len, gfp_t gfp_mask)
1146 {
1147  unsigned long kaddr = (unsigned long)data;
1148  unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1149  unsigned long start = kaddr >> PAGE_SHIFT;
1150  const int nr_pages = end - start;
1151  int offset, i;
1152  struct bio *bio;
1153 
1154  bio = bio_kmalloc(gfp_mask, nr_pages);
1155  if (!bio)
1156  return ERR_PTR(-ENOMEM);
1157 
1158  offset = offset_in_page(kaddr);
1159  for (i = 0; i < nr_pages; i++) {
1160  unsigned int bytes = PAGE_SIZE - offset;
1161 
1162  if (len <= 0)
1163  break;
1164 
1165  if (bytes > len)
1166  bytes = len;
1167 
1168  if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
1169  offset) < bytes)
1170  break;
1171 
1172  data += bytes;
1173  len -= bytes;
1174  offset = 0;
1175  }
1176 
1177  bio->bi_end_io = bio_map_kern_endio;
1178  return bio;
1179 }
1180 
1191 struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1192  gfp_t gfp_mask)
1193 {
1194  struct bio *bio;
1195 
1196  bio = __bio_map_kern(q, data, len, gfp_mask);
1197  if (IS_ERR(bio))
1198  return bio;
1199 
1200  if (bio->bi_size == len)
1201  return bio;
1202 
1203  /*
1204  * Don't support partial mappings.
1205  */
1206  bio_put(bio);
1207  return ERR_PTR(-EINVAL);
1208 }
1210 
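/*
 * Illustrative sketch (not part of the original file): wrapping a
 * kernel-virtual buffer in a bio with bio_map_kern().  The buffer must
 * not be vmalloc'ed, since __bio_map_kern() relies on virt_to_page().
 * q, buf and len are assumptions for the example.
 */
static struct bio *example_map_buffer(struct request_queue *q, void *buf,
				      unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return NULL;

	bio->bi_rw |= REQ_WRITE;	/* data flows to the device */
	return bio;
}
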
1211 static void bio_copy_kern_endio(struct bio *bio, int err)
1212 {
1213  struct bio_vec *bvec;
1214  const int read = bio_data_dir(bio) == READ;
1215  struct bio_map_data *bmd = bio->bi_private;
1216  int i;
1217  char *p = bmd->sgvecs[0].iov_base;
1218 
1219  __bio_for_each_segment(bvec, bio, i, 0) {
1220  char *addr = page_address(bvec->bv_page);
1221  int len = bmd->iovecs[i].bv_len;
1222 
1223  if (read)
1224  memcpy(p, addr, len);
1225 
1226  __free_page(bvec->bv_page);
1227  p += len;
1228  }
1229 
1230  bio_free_map_data(bmd);
1231  bio_put(bio);
1232 }
1233 
1245 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1246  gfp_t gfp_mask, int reading)
1247 {
1248  struct bio *bio;
1249  struct bio_vec *bvec;
1250  int i;
1251 
1252  bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1253  if (IS_ERR(bio))
1254  return bio;
1255 
1256  if (!reading) {
1257  void *p = data;
1258 
1259  bio_for_each_segment(bvec, bio, i) {
1260  char *addr = page_address(bvec->bv_page);
1261 
1262  memcpy(addr, p, bvec->bv_len);
1263  p += bvec->bv_len;
1264  }
1265  }
1266 
1267  bio->bi_end_io = bio_copy_kern_endio;
1268 
1269  return bio;
1270 }
1272 
1273 /*
1274  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
1275  * for performing direct-IO in BIOs.
1276  *
1277  * The problem is that we cannot run set_page_dirty() from interrupt context
1278  * because the required locks are not interrupt-safe. So what we can do is to
1279  * mark the pages dirty _before_ performing IO. And in interrupt context,
1280  * check that the pages are still dirty. If so, fine. If not, redirty them
1281  * in process context.
1282  *
1283  * We special-case compound pages here: normally this means reads into hugetlb
1284  * pages. The logic in here doesn't really work right for compound pages
1285  * because the VM does not uniformly chase down the head page in all cases.
1286  * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
1287  * handle them at all. So we skip compound pages here at an early stage.
1288  *
1289  * Note that this code is very hard to test under normal circumstances because
1290  * direct-io pins the pages with get_user_pages(). This makes
1291  * is_page_cache_freeable return false, and the VM will not clean the pages.
1292  * But other code (eg, flusher threads) could clean the pages if they are mapped
1293  * pagecache.
1294  *
1295  * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
1296  * deferred bio dirtying paths.
1297  */
1298 
1299 /*
1300  * bio_set_pages_dirty() will mark all the bio's pages as dirty.
1301  */
1302 void bio_set_pages_dirty(struct bio *bio)
1303 {
1304  struct bio_vec *bvec = bio->bi_io_vec;
1305  int i;
1306 
1307  for (i = 0; i < bio->bi_vcnt; i++) {
1308  struct page *page = bvec[i].bv_page;
1309 
1310  if (page && !PageCompound(page))
1311  set_page_dirty_lock(page);
1312  }
1313 }
1314 
1315 static void bio_release_pages(struct bio *bio)
1316 {
1317  struct bio_vec *bvec = bio->bi_io_vec;
1318  int i;
1319 
1320  for (i = 0; i < bio->bi_vcnt; i++) {
1321  struct page *page = bvec[i].bv_page;
1322 
1323  if (page)
1324  put_page(page);
1325  }
1326 }
1327 
1328 /*
1329  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1330  * If they are, then fine. If, however, some pages are clean then they must
1331  * have been written out during the direct-IO read. So we take another ref on
1332  * the BIO and the offending pages and re-dirty the pages in process context.
1333  *
1334  * It is expected that bio_check_pages_dirty() will wholly own the BIO from
1335  * here on. It will run one page_cache_release() against each page and will
1336  * run one bio_put() against the BIO.
1337  */
1338 
1339 static void bio_dirty_fn(struct work_struct *work);
1340 
1341 static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
1342 static DEFINE_SPINLOCK(bio_dirty_lock);
1343 static struct bio *bio_dirty_list;
1344 
1345 /*
1346  * This runs in process context
1347  */
1348 static void bio_dirty_fn(struct work_struct *work)
1349 {
1350  unsigned long flags;
1351  struct bio *bio;
1352 
1353  spin_lock_irqsave(&bio_dirty_lock, flags);
1354  bio = bio_dirty_list;
1355  bio_dirty_list = NULL;
1356  spin_unlock_irqrestore(&bio_dirty_lock, flags);
1357 
1358  while (bio) {
1359  struct bio *next = bio->bi_private;
1360 
1361  bio_set_pages_dirty(bio);
1362  bio_release_pages(bio);
1363  bio_put(bio);
1364  bio = next;
1365  }
1366 }
1367 
1368 void bio_check_pages_dirty(struct bio *bio)
1369 {
1370  struct bio_vec *bvec = bio->bi_io_vec;
1371  int nr_clean_pages = 0;
1372  int i;
1373 
1374  for (i = 0; i < bio->bi_vcnt; i++) {
1375  struct page *page = bvec[i].bv_page;
1376 
1377  if (PageDirty(page) || PageCompound(page)) {
1378  page_cache_release(page);
1379  bvec[i].bv_page = NULL;
1380  } else {
1381  nr_clean_pages++;
1382  }
1383  }
1384 
1385  if (nr_clean_pages) {
1386  unsigned long flags;
1387 
1388  spin_lock_irqsave(&bio_dirty_lock, flags);
1389  bio->bi_private = bio_dirty_list;
1390  bio_dirty_list = bio;
1391  spin_unlock_irqrestore(&bio_dirty_lock, flags);
1392  schedule_work(&bio_dirty_work);
1393  } else {
1394  bio_put(bio);
1395  }
1396 }
1397 
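/*
 * Illustrative sketch (not part of the original file): the direct-I/O
 * pattern described in the comment above -- dirty the user pages
 * before submission; the completion path is then expected to call
 * bio_check_pages_dirty() for READs.  The bio is assumed to be built
 * from get_user_pages() pages.
 */
static void example_dio_submit(struct bio *bio)
{
	if (bio_data_dir(bio) == READ)
		bio_set_pages_dirty(bio);

	submit_bio(bio->bi_rw, bio);
}
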
1398 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1399 void bio_flush_dcache_pages(struct bio *bi)
1400 {
1401  int i;
1402  struct bio_vec *bvec;
1403 
1404  bio_for_each_segment(bvec, bi, i)
1405  flush_dcache_page(bvec->bv_page);
1406 }
1407 EXPORT_SYMBOL(bio_flush_dcache_pages);
1408 #endif
1409 
1424 void bio_endio(struct bio *bio, int error)
1425 {
1426  if (error)
1427  clear_bit(BIO_UPTODATE, &bio->bi_flags);
1428  else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1429  error = -EIO;
1430 
1431  if (bio->bi_end_io)
1432  bio->bi_end_io(bio, error);
1433 }
1435 
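/*
 * Illustrative sketch (not part of the original file): a minimal
 * bi_end_io callback of the kind bio_endio() invokes.  The error
 * handling and log message are assumptions for the example.
 */
static void example_end_io(struct bio *bio, int error)
{
	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
		printk(KERN_ERR "example: I/O failed with %d\n",
		       error ? error : -EIO);

	bio_put(bio);		/* drop the submitter's reference */
}
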
1436 void bio_pair_release(struct bio_pair *bp)
1437 {
1438  if (atomic_dec_and_test(&bp->cnt)) {
1439  struct bio *master = bp->bio1.bi_private;
1440 
1441  bio_endio(master, bp->error);
1442  mempool_free(bp, bp->bio2.bi_private);
1443  }
1444 }
1446 
1447 static void bio_pair_end_1(struct bio *bi, int err)
1448 {
1449  struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
1450 
1451  if (err)
1452  bp->error = err;
1453 
1454  bio_pair_release(bp);
1455 }
1456 
1457 static void bio_pair_end_2(struct bio *bi, int err)
1458 {
1459  struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
1460 
1461  if (err)
1462  bp->error = err;
1463 
1464  bio_pair_release(bp);
1465 }
1466 
1467 /*
1468  * split a bio - only worry about a bio with a single page in its iovec
1469  */
1470 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1471 {
1472  struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
1473 
1474  if (!bp)
1475  return bp;
1476 
1477  trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1478  bi->bi_sector + first_sectors);
1479 
1480  BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
1481  BUG_ON(bi->bi_idx != 0);
1482  atomic_set(&bp->cnt, 3);
1483  bp->error = 0;
1484  bp->bio1 = *bi;
1485  bp->bio2 = *bi;
1486  bp->bio2.bi_sector += first_sectors;
1487  bp->bio2.bi_size -= first_sectors << 9;
1488  bp->bio1.bi_size = first_sectors << 9;
1489 
1490  if (bi->bi_vcnt != 0) {
1491  bp->bv1 = bi->bi_io_vec[0];
1492  bp->bv2 = bi->bi_io_vec[0];
1493 
1494  if (bio_is_rw(bi)) {
1495  bp->bv2.bv_offset += first_sectors << 9;
1496  bp->bv2.bv_len -= first_sectors << 9;
1497  bp->bv1.bv_len = first_sectors << 9;
1498  }
1499 
1500  bp->bio1.bi_io_vec = &bp->bv1;
1501  bp->bio2.bi_io_vec = &bp->bv2;
1502 
1503  bp->bio1.bi_max_vecs = 1;
1504  bp->bio2.bi_max_vecs = 1;
1505  }
1506 
1507  bp->bio1.bi_end_io = bio_pair_end_1;
1508  bp->bio2.bi_end_io = bio_pair_end_2;
1509 
1510  bp->bio1.bi_private = bi;
1511  bp->bio2.bi_private = bio_split_pool;
1512 
1513  if (bio_integrity(bi))
1514  bio_integrity_split(bi, bp, first_sectors);
1515 
1516  return bp;
1517 }
1519 
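/*
 * Illustrative sketch (not part of the original file): splitting a bio
 * that straddles a boundary (e.g. a RAID0 chunk, assumed to be a power
 * of two sectors) and submitting both halves, mirroring how md uses
 * bio_split().
 */
static void example_split_submit(struct bio *bio, int boundary_sectors)
{
	int first = boundary_sectors -
		    (bio->bi_sector & (boundary_sectors - 1));
	struct bio_pair *bp = bio_split(bio, first);

	if (!bp)
		return;

	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);	/* drop the reference bio_split() gave us */
}
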
1530 sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1531  unsigned int offset)
1532 {
1533  unsigned int sector_sz;
1534  struct bio_vec *bv;
1535  sector_t sectors;
1536  int i;
1537 
1538  sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1539  sectors = 0;
1540 
1541  if (index >= bio->bi_idx)
1542  index = bio->bi_vcnt - 1;
1543 
1544  __bio_for_each_segment(bv, bio, i, 0) {
1545  if (i == index) {
1546  if (offset > bv->bv_offset)
1547  sectors += (offset - bv->bv_offset) / sector_sz;
1548  break;
1549  }
1550 
1551  sectors += bv->bv_len / sector_sz;
1552  }
1553 
1554  return sectors;
1555 }
1557 
1558 /*
1559  * create memory pools for biovec's in a bio_set.
1560  * use the global biovec slabs created for general use.
1561  */
1562 static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1563 {
1564  struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1565 
1566  bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
1567  if (!bs->bvec_pool)
1568  return -ENOMEM;
1569 
1570  return 0;
1571 }
1572 
1573 static void biovec_free_pools(struct bio_set *bs)
1574 {
1575  mempool_destroy(bs->bvec_pool);
1576 }
1577 
1578 void bioset_free(struct bio_set *bs)
1579 {
1580  if (bs->bio_pool)
1581  mempool_destroy(bs->bio_pool);
1582 
1582 
1583  bioset_integrity_free(bs);
1584  biovec_free_pools(bs);
1585  bio_put_slab(bs);
1586 
1587  kfree(bs);
1588 }
1590 
1604 struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1605 {
1606  unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1607  struct bio_set *bs;
1608 
1609  bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1610  if (!bs)
1611  return NULL;
1612 
1613  bs->front_pad = front_pad;
1614 
1615  bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1616  if (!bs->bio_slab) {
1617  kfree(bs);
1618  return NULL;
1619  }
1620 
1621  bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1622  if (!bs->bio_pool)
1623  goto bad;
1624 
1625  if (!biovec_create_pools(bs, pool_size))
1626  return bs;
1627 
1628 bad:
1629  bioset_free(bs);
1630  return NULL;
1631 }
1633 
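/*
 * Illustrative sketch (not part of the original file): a stacking
 * driver creating its own bio_set and using front_pad to co-allocate
 * per-bio state in front of the struct bio, the same trick dm and md
 * use.  struct example_io and the pool size are assumptions.
 */
struct example_io {
	void *ctx;		/* driver state travelling with the bio */
	struct bio bio;		/* must be last; allocated from example_bs */
};

static struct bio_set *example_bs;

static int example_create_pool(void)
{
	example_bs = bioset_create(BIO_POOL_SIZE,
				   offsetof(struct example_io, bio));
	return example_bs ? 0 : -ENOMEM;
}

static struct example_io *example_io_alloc(gfp_t gfp)
{
	struct bio *bio = bio_alloc_bioset(gfp, 1, example_bs);

	return bio ? container_of(bio, struct example_io, bio) : NULL;
}
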
1634 #ifdef CONFIG_BLK_CGROUP
1635 
1648 int bio_associate_current(struct bio *bio)
1649 {
1650  struct io_context *ioc;
1651  struct cgroup_subsys_state *css;
1652 
1653  if (bio->bi_ioc)
1654  return -EBUSY;
1655 
1656  ioc = current->io_context;
1657  if (!ioc)
1658  return -ENOENT;
1659 
1660  /* acquire active ref on @ioc and associate */
1661  get_io_context_active(ioc);
1662  bio->bi_ioc = ioc;
1663 
1664  /* associate blkcg if exists */
1665  rcu_read_lock();
1666  css = task_subsys_state(current, blkio_subsys_id);
1667  if (css && css_tryget(css))
1668  bio->bi_css = css;
1669  rcu_read_unlock();
1670 
1671  return 0;
1672 }
1673 
1678 void bio_disassociate_task(struct bio *bio)
1679 {
1680  if (bio->bi_ioc) {
1681  put_io_context(bio->bi_ioc);
1682  bio->bi_ioc = NULL;
1683  }
1684  if (bio->bi_css) {
1685  css_put(bio->bi_css);
1686  bio->bi_css = NULL;
1687  }
1688 }
1689 
1690 #endif /* CONFIG_BLK_CGROUP */
1691 
1692 static void __init biovec_init_slabs(void)
1693 {
1694  int i;
1695 
1696  for (i = 0; i < BIOVEC_NR_POOLS; i++) {
1697  int size;
1698  struct biovec_slab *bvs = bvec_slabs + i;
1699 
1700  if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1701  bvs->slab = NULL;
1702  continue;
1703  }
1704 
1705  size = bvs->nr_vecs * sizeof(struct bio_vec);
1706  bvs->slab = kmem_cache_create(bvs->name, size, 0,
1707  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1708  }
1709 }
1710 
1711 static int __init init_bio(void)
1712 {
1713  bio_slab_max = 2;
1714  bio_slab_nr = 0;
1715  bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
1716  if (!bio_slabs)
1717  panic("bio: can't allocate bios\n");
1718 
1719  bio_integrity_init();
1720  biovec_init_slabs();
1721 
1722  fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
1723  if (!fs_bio_set)
1724  panic("bio: can't allocate bios\n");
1725 
1726  if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
1727  panic("bio: can't create integrity pool\n");
1728 
1729  bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
1730  sizeof(struct bio_pair));
1731  if (!bio_split_pool)
1732  panic("bio: can't create split pool\n");
1733 
1734  return 0;
1735 }
1736 subsys_initcall(init_bio);