Linux Kernel 3.7.1
dm-thin.c
1 /*
2  * Copyright (C) 2011-2012 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 #include "dm-bio-prison.h"
9 #include "dm.h"
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/list.h>
15 #include <linux/init.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 
19 #define DM_MSG_PREFIX "thin"
20 
21 /*
22  * Tunable constants
23  */
24 #define ENDIO_HOOK_POOL_SIZE 1024
25 #define MAPPING_POOL_SIZE 1024
26 #define PRISON_CELLS 1024
27 #define COMMIT_PERIOD HZ
28 
29 /*
30  * The block size of the device holding pool data must be
31  * between 64KB and 1GB.
32  */
33 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
34 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
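/*
 * As a quick worked example of the arithmetic above (assuming the usual
 * 512-byte sectors, i.e. SECTOR_SHIFT == 9):
 *
 *   DATA_DEV_BLOCK_SIZE_MIN_SECTORS = 64 * 1024 >> 9          = 128 sectors
 *   DATA_DEV_BLOCK_SIZE_MAX_SECTORS = 1024 * 1024 * 1024 >> 9 = 2097152 sectors
 *
 * so pool data block sizes range from 128 to 2097152 sectors.
 */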
35 
36 /*
37  * Device id is restricted to 24 bits.
38  */
39 #define MAX_DEV_ID ((1 << 24) - 1)
40 
41 /*
42  * How do we handle breaking sharing of data blocks?
43  * =================================================
44  *
45  * We use a standard copy-on-write btree to store the mappings for the
46  * devices (note I'm talking about copy-on-write of the metadata here, not
47  * the data). When you take an internal snapshot you clone the root node
48  * of the origin btree. After this there is no concept of an origin or a
49  * snapshot. They are just two device trees that happen to point to the
50  * same data blocks.
51  *
52  * When we get a write in we decide if it's to a shared data block using
53  * some timestamp magic. If it is, we have to break sharing.
54  *
55  * Let's say we write to a shared block in what was the origin. The
56  * steps are:
57  *
58  * i) plug io further to this physical block. (see bio_prison code).
59  *
60  * ii) quiesce any read io to that shared data block. Obviously
61  * including all devices that share this block. (see dm_deferred_set code)
62  *
63  * iii) copy the data block to a newly allocated block. This step can be
64  * missed out if the io covers the block. (schedule_copy).
65  *
66  * iv) insert the new mapping into the origin's btree
67  * (process_prepared_mapping). This act of inserting breaks some
68  * sharing of btree nodes between the two devices. Breaking sharing only
69  * affects the btree of that specific device. Btrees for the other
70  * devices that share the block never change. The btree for the origin
71  * device as it was after the last commit is untouched, ie. we're using
72  * persistent data structures in the functional programming sense.
73  *
74  * v) unplug io to this physical block, including the io that triggered
75  * the breaking of sharing.
76  *
77  * Steps (ii) and (iii) occur in parallel.
78  *
79  * The metadata _doesn't_ need to be committed before the io continues. We
80  * get away with this because the io is always written to a _new_ block.
81  * If there's a crash, then:
82  *
83  * - The origin mapping will point to the old origin block (the shared
84  * one). This will contain the data as it was before the io that triggered
85  * the breaking of sharing came in.
86  *
87  * - The snap mapping still points to the old block. As it would after
88  * the commit.
89  *
90  * The downside of this scheme is the timestamp magic isn't perfect, and
91  * will continue to think that data block in the snapshot device is shared
92  * even after the write to the origin has broken sharing. I suspect data
93  * blocks will typically be shared by many different devices, so we're
94  * breaking sharing n + 1 times, rather than n, where n is the number of
95  * devices that reference this data block. At the moment I think the
96  * benefits far, far outweigh the disadvantages.
97  */
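/*
 * As a rough, simplified sketch (the real flow is driven by the pool's
 * workqueue and the bio prison), the five steps above map onto the helpers
 * defined later in this file roughly like this:
 *
 *	build_data_key(tc->td, shared_block, &key);
 *	dm_bio_detain(pool->prison, &key, bio, &cell);	// (i) plug io to the block
 *	dm_deferred_set_add_work(...);			// (ii) quiesce readers
 *	schedule_copy(...);				// (iii) copy to a new block
 *	process_prepared_mapping(m);			// (iv) insert the new mapping
 *	cell_defer(...);				// (v) unplug the held io
 */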
98 
99 /*----------------------------------------------------------------*/
100 
101 /*
102  * Key building.
103  */
104 static void build_data_key(struct dm_thin_device *td,
105  dm_block_t b, struct dm_cell_key *key)
106 {
107  key->virtual = 0;
108  key->dev = dm_thin_dev_id(td);
109  key->block = b;
110 }
111 
112 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
113  struct dm_cell_key *key)
114 {
115  key->virtual = 1;
116  key->dev = dm_thin_dev_id(td);
117  key->block = b;
118 }
119 
120 /*----------------------------------------------------------------*/
121 
122 /*
123  * A pool device ties together a metadata device and a data device. It
124  * also provides the interface for creating and destroying internal
125  * devices.
126  */
127 struct dm_thin_new_mapping;
128 
129 /*
130  * The pool runs in 3 modes. Ordered in degraded order for comparisons.
131  */
132 enum pool_mode {
133  PM_WRITE, /* metadata may be changed */
134  PM_READ_ONLY, /* metadata may not be changed */
135  PM_FAIL, /* all I/O fails */
136 };
137 
138 struct pool_features {
139  enum pool_mode mode;
140 
141  bool zero_new_blocks:1;
142  bool discard_enabled:1;
143  bool discard_passdown:1;
144 };
145 
146 struct thin_c;
147 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
148 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
149 
150 struct pool {
151  struct list_head list;
152  struct dm_target *ti; /* Only set if a pool target is bound */
153 
154  struct mapped_device *pool_md;
155  struct block_device *md_dev;
156  struct dm_pool_metadata *pmd;
157 
158  dm_block_t low_water_blocks;
159  uint32_t sectors_per_block;
160  int sectors_per_block_shift;
161 
162  struct pool_features pf;
163  unsigned low_water_triggered:1; /* A dm event has been sent */
164  unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
165 
166  struct dm_bio_prison *prison;
167  struct dm_kcopyd_client *copier;
168 
169  struct workqueue_struct *wq;
170  struct work_struct worker;
171  struct delayed_work waker;
172 
173  unsigned long last_commit_jiffies;
174  unsigned ref_count;
175 
176  spinlock_t lock;
177  struct bio_list deferred_bios;
178  struct bio_list deferred_flush_bios;
179  struct list_head prepared_mappings;
180  struct list_head prepared_discards;
181 
182  struct bio_list retry_on_resume_list;
183 
184  struct dm_deferred_set *shared_read_ds;
185  struct dm_deferred_set *all_io_ds;
186 
187  struct dm_thin_new_mapping *next_mapping;
188  mempool_t *mapping_pool;
189  mempool_t *endio_hook_pool;
190 
191  process_bio_fn process_bio;
192  process_bio_fn process_discard;
193 
194  process_mapping_fn process_prepared_mapping;
195  process_mapping_fn process_prepared_discard;
196 };
197 
198 static enum pool_mode get_pool_mode(struct pool *pool);
199 static void set_pool_mode(struct pool *pool, enum pool_mode mode);
200 
201 /*
202  * Target context for a pool.
203  */
204 struct pool_c {
205  struct dm_target *ti;
206  struct pool *pool;
207  struct dm_dev *data_dev;
208  struct dm_dev *metadata_dev;
209  struct dm_target_callbacks callbacks;
210 
211  dm_block_t low_water_blocks;
212  struct pool_features requested_pf; /* Features requested during table load */
213  struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
214 };
215 
216 /*
217  * Target context for a thin.
218  */
219 struct thin_c {
220  struct dm_dev *pool_dev;
221  struct dm_dev *origin_dev;
222  dm_thin_id dev_id;
223 
224  struct pool *pool;
225  struct dm_thin_device *td;
226 };
227 
228 /*----------------------------------------------------------------*/
229 
230 /*
231  * A global list of pools that uses a struct mapped_device as a key.
232  */
233 static struct dm_thin_pool_table {
234  struct mutex mutex;
235  struct list_head pools;
236 } dm_thin_pool_table;
237 
238 static void pool_table_init(void)
239 {
240  mutex_init(&dm_thin_pool_table.mutex);
241  INIT_LIST_HEAD(&dm_thin_pool_table.pools);
242 }
243 
244 static void __pool_table_insert(struct pool *pool)
245 {
246  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
247  list_add(&pool->list, &dm_thin_pool_table.pools);
248 }
249 
250 static void __pool_table_remove(struct pool *pool)
251 {
252  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
253  list_del(&pool->list);
254 }
255 
256 static struct pool *__pool_table_lookup(struct mapped_device *md)
257 {
258  struct pool *pool = NULL, *tmp;
259 
260  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
261 
262  list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
263  if (tmp->pool_md == md) {
264  pool = tmp;
265  break;
266  }
267  }
268 
269  return pool;
270 }
271 
272 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
273 {
274  struct pool *pool = NULL, *tmp;
275 
276  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
277 
278  list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
279  if (tmp->md_dev == md_dev) {
280  pool = tmp;
281  break;
282  }
283  }
284 
285  return pool;
286 }
287 
288 /*----------------------------------------------------------------*/
289 
290 struct dm_thin_endio_hook {
291  struct thin_c *tc;
292  struct dm_deferred_entry *shared_read_entry;
293  struct dm_deferred_entry *all_io_entry;
294  struct dm_thin_new_mapping *overwrite_mapping;
295 };
296 
297 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
298 {
299  struct bio *bio;
300  struct bio_list bios;
301 
302  bio_list_init(&bios);
303  bio_list_merge(&bios, master);
304  bio_list_init(master);
305 
306  while ((bio = bio_list_pop(&bios))) {
307  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
308 
309  if (h->tc == tc)
310  bio_endio(bio, DM_ENDIO_REQUEUE);
311  else
312  bio_list_add(master, bio);
313  }
314 }
315 
316 static void requeue_io(struct thin_c *tc)
317 {
318  struct pool *pool = tc->pool;
319  unsigned long flags;
320 
321  spin_lock_irqsave(&pool->lock, flags);
322  __requeue_bio_list(tc, &pool->deferred_bios);
323  __requeue_bio_list(tc, &pool->retry_on_resume_list);
324  spin_unlock_irqrestore(&pool->lock, flags);
325 }
326 
327 /*
328  * This section of code contains the logic for processing a thin device's IO.
329  * Much of the code depends on pool object resources (lists, workqueues, etc)
330  * but most is exclusively called from the thin target rather than the thin-pool
331  * target.
332  */
333 
334 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
335 {
336  sector_t block_nr = bio->bi_sector;
337 
338  if (tc->pool->sectors_per_block_shift < 0)
339  (void) sector_div(block_nr, tc->pool->sectors_per_block);
340  else
341  block_nr >>= tc->pool->sectors_per_block_shift;
342 
343  return block_nr;
344 }
345 
346 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
347 {
348  struct pool *pool = tc->pool;
349  sector_t bi_sector = bio->bi_sector;
350 
351  bio->bi_bdev = tc->pool_dev->bdev;
352  if (tc->pool->sectors_per_block_shift < 0)
353  bio->bi_sector = (block * pool->sectors_per_block) +
354  sector_div(bi_sector, pool->sectors_per_block);
355  else
356  bio->bi_sector = (block << pool->sectors_per_block_shift) |
357  (bi_sector & (pool->sectors_per_block - 1));
358 }
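/*
 * For example (illustrative numbers only): with sectors_per_block = 128,
 * a power of two, sectors_per_block_shift is 7. A bio at sector 300 then
 * sits in virtual block 300 >> 7 = 2 (get_bio_block), and remapping it to
 * data block 5 gives bi_sector = (5 << 7) | (300 & 127) = 640 + 44 = 684.
 */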
359 
360 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
361 {
362  bio->bi_bdev = tc->origin_dev->bdev;
363 }
364 
365 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
366 {
367  return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
368  dm_thin_changed_this_transaction(tc->td);
369 }
370 
371 static void issue(struct thin_c *tc, struct bio *bio)
372 {
373  struct pool *pool = tc->pool;
374  unsigned long flags;
375 
376  if (!bio_triggers_commit(tc, bio)) {
377  generic_make_request(bio);
378  return;
379  }
380 
381  /*
382  * Complete bio with an error if earlier I/O caused changes to
383  * the metadata that can't be committed e.g, due to I/O errors
384  * on the metadata device.
385  */
386  if (dm_thin_aborted_changes(tc->td)) {
387  bio_io_error(bio);
388  return;
389  }
390 
391  /*
392  * Batch together any bios that trigger commits and then issue a
393  * single commit for them in process_deferred_bios().
394  */
395  spin_lock_irqsave(&pool->lock, flags);
396  bio_list_add(&pool->deferred_flush_bios, bio);
397  spin_unlock_irqrestore(&pool->lock, flags);
398 }
399 
400 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
401 {
402  remap_to_origin(tc, bio);
403  issue(tc, bio);
404 }
405 
406 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
407  dm_block_t block)
408 {
409  remap(tc, bio, block);
410  issue(tc, bio);
411 }
412 
413 /*
414  * wake_worker() is used when new work is queued and when pool_resume is
415  * ready to continue deferred IO processing.
416  */
417 static void wake_worker(struct pool *pool)
418 {
419  queue_work(pool->wq, &pool->worker);
420 }
421 
422 /*----------------------------------------------------------------*/
423 
424 /*
425  * Bio endio functions.
426  */
427 struct dm_thin_new_mapping {
428  struct list_head list;
429 
430  unsigned quiesced:1;
431  unsigned prepared:1;
432  unsigned pass_discard:1;
433 
434  struct thin_c *tc;
435  dm_block_t virt_block;
436  dm_block_t data_block;
437  struct dm_bio_prison_cell *cell, *cell2;
438  int err;
439 
440  /*
441  * If the bio covers the whole area of a block then we can avoid
442  * zeroing or copying. Instead this bio is hooked. The bio will
443  * still be in the cell, so care has to be taken to avoid issuing
444  * the bio twice.
445  */
446  struct bio *bio;
447  bio_end_io_t *saved_bi_end_io;
448 };
449 
450 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
451 {
452  struct pool *pool = m->tc->pool;
453 
454  if (m->quiesced && m->prepared) {
455  list_add(&m->list, &pool->prepared_mappings);
456  wake_worker(pool);
457  }
458 }
459 
460 static void copy_complete(int read_err, unsigned long write_err, void *context)
461 {
462  unsigned long flags;
463  struct dm_thin_new_mapping *m = context;
464  struct pool *pool = m->tc->pool;
465 
466  m->err = read_err || write_err ? -EIO : 0;
467 
468  spin_lock_irqsave(&pool->lock, flags);
469  m->prepared = 1;
470  __maybe_add_mapping(m);
471  spin_unlock_irqrestore(&pool->lock, flags);
472 }
473 
474 static void overwrite_endio(struct bio *bio, int err)
475 {
476  unsigned long flags;
477  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
478  struct dm_thin_new_mapping *m = h->overwrite_mapping;
479  struct pool *pool = m->tc->pool;
480 
481  m->err = err;
482 
483  spin_lock_irqsave(&pool->lock, flags);
484  m->prepared = 1;
485  __maybe_add_mapping(m);
486  spin_unlock_irqrestore(&pool->lock, flags);
487 }
488 
489 /*----------------------------------------------------------------*/
490 
491 /*
492  * Workqueue.
493  */
494 
495 /*
496  * Prepared mapping jobs.
497  */
498 
499 /*
500  * This sends the bios in the cell back to the deferred_bios list.
501  */
502 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
503  dm_block_t data_block)
504 {
505  struct pool *pool = tc->pool;
506  unsigned long flags;
507 
508  spin_lock_irqsave(&pool->lock, flags);
509  dm_cell_release(cell, &pool->deferred_bios);
510  spin_unlock_irqrestore(&tc->pool->lock, flags);
511 
512  wake_worker(pool);
513 }
514 
515 /*
516  * Same as cell_defer above, except it omits one particular detainee,
517  * a write bio that covers the block and has already been processed.
518  */
519 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
520 {
521  struct bio_list bios;
522  struct pool *pool = tc->pool;
523  unsigned long flags;
524 
525  bio_list_init(&bios);
526 
527  spin_lock_irqsave(&pool->lock, flags);
528  dm_cell_release_no_holder(cell, &pool->deferred_bios);
529  spin_unlock_irqrestore(&pool->lock, flags);
530 
531  wake_worker(pool);
532 }
533 
534 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
535 {
536  if (m->bio)
537  m->bio->bi_end_io = m->saved_bi_end_io;
538  dm_cell_error(m->cell);
539  list_del(&m->list);
540  mempool_free(m, m->tc->pool->mapping_pool);
541 }
542 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
543 {
544  struct thin_c *tc = m->tc;
545  struct bio *bio;
546  int r;
547 
548  bio = m->bio;
549  if (bio)
550  bio->bi_end_io = m->saved_bi_end_io;
551 
552  if (m->err) {
553  dm_cell_error(m->cell);
554  goto out;
555  }
556 
557  /*
558  * Commit the prepared block into the mapping btree.
559  * Any I/O for this block arriving after this point will get
560  * remapped to it directly.
561  */
562  r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
563  if (r) {
564  DMERR("dm_thin_insert_block() failed");
565  dm_cell_error(m->cell);
566  goto out;
567  }
568 
569  /*
570  * Release any bios held while the block was being provisioned.
571  * If we are processing a write bio that completely covers the block,
572  * we already processed it so can ignore it now when processing
573  * the bios in the cell.
574  */
575  if (bio) {
576  cell_defer_except(tc, m->cell);
577  bio_endio(bio, 0);
578  } else
579  cell_defer(tc, m->cell, m->data_block);
580 
581 out:
582  list_del(&m->list);
583  mempool_free(m, tc->pool->mapping_pool);
584 }
585 
586 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
587 {
588  struct thin_c *tc = m->tc;
589 
590  bio_io_error(m->bio);
591  cell_defer_except(tc, m->cell);
592  cell_defer_except(tc, m->cell2);
593  mempool_free(m, tc->pool->mapping_pool);
594 }
595 
596 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
597 {
598  struct thin_c *tc = m->tc;
599 
600  if (m->pass_discard)
601  remap_and_issue(tc, m->bio, m->data_block);
602  else
603  bio_endio(m->bio, 0);
604 
605  cell_defer_except(tc, m->cell);
606  cell_defer_except(tc, m->cell2);
607  mempool_free(m, tc->pool->mapping_pool);
608 }
609 
610 static void process_prepared_discard(struct dm_thin_new_mapping *m)
611 {
612  int r;
613  struct thin_c *tc = m->tc;
614 
615  r = dm_thin_remove_block(tc->td, m->virt_block);
616  if (r)
617  DMERR("dm_thin_remove_block() failed");
618 
619  process_prepared_discard_passdown(m);
620 }
621 
622 static void process_prepared(struct pool *pool, struct list_head *head,
623  process_mapping_fn *fn)
624 {
625  unsigned long flags;
626  struct list_head maps;
627  struct dm_thin_new_mapping *m, *tmp;
628 
629  INIT_LIST_HEAD(&maps);
630  spin_lock_irqsave(&pool->lock, flags);
631  list_splice_init(head, &maps);
632  spin_unlock_irqrestore(&pool->lock, flags);
633 
634  list_for_each_entry_safe(m, tmp, &maps, list)
635  (*fn)(m);
636 }
637 
638 /*
639  * Deferred bio jobs.
640  */
641 static int io_overlaps_block(struct pool *pool, struct bio *bio)
642 {
643  return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
644 }
645 
646 static int io_overwrites_block(struct pool *pool, struct bio *bio)
647 {
648  return (bio_data_dir(bio) == WRITE) &&
649  io_overlaps_block(pool, bio);
650 }
651 
652 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
653  bio_end_io_t *fn)
654 {
655  *save = bio->bi_end_io;
656  bio->bi_end_io = fn;
657 }
658 
659 static int ensure_next_mapping(struct pool *pool)
660 {
661  if (pool->next_mapping)
662  return 0;
663 
664  pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
665 
666  return pool->next_mapping ? 0 : -ENOMEM;
667 }
668 
669 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
670 {
671  struct dm_thin_new_mapping *r = pool->next_mapping;
672 
673  BUG_ON(!pool->next_mapping);
674 
675  pool->next_mapping = NULL;
676 
677  return r;
678 }
679 
680 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
681  struct dm_dev *origin, dm_block_t data_origin,
682  dm_block_t data_dest,
683  struct dm_bio_prison_cell *cell, struct bio *bio)
684 {
685  int r;
686  struct pool *pool = tc->pool;
687  struct dm_thin_new_mapping *m = get_next_mapping(pool);
688 
689  INIT_LIST_HEAD(&m->list);
690  m->quiesced = 0;
691  m->prepared = 0;
692  m->tc = tc;
693  m->virt_block = virt_block;
694  m->data_block = data_dest;
695  m->cell = cell;
696  m->err = 0;
697  m->bio = NULL;
698 
699  if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
700  m->quiesced = 1;
701 
702  /*
703  * IO to pool_dev remaps to the pool target's data_dev.
704  *
705  * If the whole block of data is being overwritten, we can issue the
706  * bio immediately. Otherwise we use kcopyd to clone the data first.
707  */
708  if (io_overwrites_block(pool, bio)) {
709  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
710 
711  h->overwrite_mapping = m;
712  m->bio = bio;
713  save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
714  remap_and_issue(tc, bio, data_dest);
715  } else {
716  struct dm_io_region from, to;
717 
718  from.bdev = origin->bdev;
719  from.sector = data_origin * pool->sectors_per_block;
720  from.count = pool->sectors_per_block;
721 
722  to.bdev = tc->pool_dev->bdev;
723  to.sector = data_dest * pool->sectors_per_block;
724  to.count = pool->sectors_per_block;
725 
726  r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
727  0, copy_complete, m);
728  if (r < 0) {
729  mempool_free(m, pool->mapping_pool);
730  DMERR("dm_kcopyd_copy() failed");
731  dm_cell_error(cell);
732  }
733  }
734 }
735 
736 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
737  dm_block_t data_origin, dm_block_t data_dest,
738  struct dm_bio_prison_cell *cell, struct bio *bio)
739 {
740  schedule_copy(tc, virt_block, tc->pool_dev,
741  data_origin, data_dest, cell, bio);
742 }
743 
744 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
745  dm_block_t data_dest,
746  struct dm_bio_prison_cell *cell, struct bio *bio)
747 {
748  schedule_copy(tc, virt_block, tc->origin_dev,
749  virt_block, data_dest, cell, bio);
750 }
751 
752 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
753  dm_block_t data_block, struct dm_bio_prison_cell *cell,
754  struct bio *bio)
755 {
756  struct pool *pool = tc->pool;
757  struct dm_thin_new_mapping *m = get_next_mapping(pool);
758 
759  INIT_LIST_HEAD(&m->list);
760  m->quiesced = 1;
761  m->prepared = 0;
762  m->tc = tc;
763  m->virt_block = virt_block;
764  m->data_block = data_block;
765  m->cell = cell;
766  m->err = 0;
767  m->bio = NULL;
768 
769  /*
770  * If the whole block of data is being overwritten or we are not
771  * zeroing pre-existing data, we can issue the bio immediately.
772  * Otherwise we use kcopyd to zero the data first.
773  */
774  if (!pool->pf.zero_new_blocks)
775  process_prepared_mapping(m);
776 
777  else if (io_overwrites_block(pool, bio)) {
778  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
779 
780  h->overwrite_mapping = m;
781  m->bio = bio;
782  save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
783  remap_and_issue(tc, bio, data_block);
784  } else {
785  int r;
786  struct dm_io_region to;
787 
788  to.bdev = tc->pool_dev->bdev;
789  to.sector = data_block * pool->sectors_per_block;
790  to.count = pool->sectors_per_block;
791 
792  r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
793  if (r < 0) {
794  mempool_free(m, pool->mapping_pool);
795  DMERR("dm_kcopyd_zero() failed");
796  dm_cell_error(cell);
797  }
798  }
799 }
800 
801 static int commit(struct pool *pool)
802 {
803  int r;
804 
805  r = dm_pool_commit_metadata(pool->pmd);
806  if (r)
807  DMERR("commit failed, error = %d", r);
808 
809  return r;
810 }
811 
812 /*
813  * A non-zero return indicates read_only or fail_io mode.
814  * Many callers don't care about the return value.
815  */
816 static int commit_or_fallback(struct pool *pool)
817 {
818  int r;
819 
820  if (get_pool_mode(pool) != PM_WRITE)
821  return -EINVAL;
822 
823  r = commit(pool);
824  if (r)
825  set_pool_mode(pool, PM_READ_ONLY);
826 
827  return r;
828 }
829 
830 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
831 {
832  int r;
833  dm_block_t free_blocks;
834  unsigned long flags;
835  struct pool *pool = tc->pool;
836 
837  r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
838  if (r)
839  return r;
840 
841  if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
842  DMWARN("%s: reached low water mark, sending event.",
843  dm_device_name(pool->pool_md));
844  spin_lock_irqsave(&pool->lock, flags);
845  pool->low_water_triggered = 1;
846  spin_unlock_irqrestore(&pool->lock, flags);
847  dm_table_event(pool->ti->table);
848  }
849 
850  if (!free_blocks) {
851  if (pool->no_free_space)
852  return -ENOSPC;
853  else {
854  /*
855  * Try to commit to see if that will free up some
856  * more space.
857  */
858  (void) commit_or_fallback(pool);
859 
860  r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
861  if (r)
862  return r;
863 
864  /*
865  * If we still have no space we set a flag to avoid
866  * doing all this checking and return -ENOSPC.
867  */
868  if (!free_blocks) {
869  DMWARN("%s: no free space available.",
870  dm_device_name(pool->pool_md));
871  spin_lock_irqsave(&pool->lock, flags);
872  pool->no_free_space = 1;
873  spin_unlock_irqrestore(&pool->lock, flags);
874  return -ENOSPC;
875  }
876  }
877  }
878 
879  r = dm_pool_alloc_data_block(pool->pmd, result);
880  if (r)
881  return r;
882 
883  return 0;
884 }
885 
886 /*
887  * If we have run out of space, queue bios until the device is
888  * resumed, presumably after having been reloaded with more space.
889  */
890 static void retry_on_resume(struct bio *bio)
891 {
892  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
893  struct thin_c *tc = h->tc;
894  struct pool *pool = tc->pool;
895  unsigned long flags;
896 
897  spin_lock_irqsave(&pool->lock, flags);
898  bio_list_add(&pool->retry_on_resume_list, bio);
899  spin_unlock_irqrestore(&pool->lock, flags);
900 }
901 
902 static void no_space(struct dm_bio_prison_cell *cell)
903 {
904  struct bio *bio;
905  struct bio_list bios;
906 
907  bio_list_init(&bios);
908  dm_cell_release(cell, &bios);
909 
910  while ((bio = bio_list_pop(&bios)))
911  retry_on_resume(bio);
912 }
913 
914 static void process_discard(struct thin_c *tc, struct bio *bio)
915 {
916  int r;
917  unsigned long flags;
918  struct pool *pool = tc->pool;
919  struct dm_bio_prison_cell *cell, *cell2;
920  struct dm_cell_key key, key2;
921  dm_block_t block = get_bio_block(tc, bio);
922  struct dm_thin_lookup_result lookup_result;
923  struct dm_thin_new_mapping *m;
924 
925  build_virtual_key(tc->td, block, &key);
926  if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
927  return;
928 
929  r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
930  switch (r) {
931  case 0:
932  /*
933  * Check nobody is fiddling with this pool block. This can
934  * happen if someone's in the process of breaking sharing
935  * on this block.
936  */
937  build_data_key(tc->td, lookup_result.block, &key2);
938  if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
939  dm_cell_release_singleton(cell, bio);
940  break;
941  }
942 
943  if (io_overlaps_block(pool, bio)) {
944  /*
945  * IO may still be going to the destination block. We must
946  * quiesce before we can do the removal.
947  */
948  m = get_next_mapping(pool);
949  m->tc = tc;
950  m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
951  m->virt_block = block;
952  m->data_block = lookup_result.block;
953  m->cell = cell;
954  m->cell2 = cell2;
955  m->err = 0;
956  m->bio = bio;
957 
958  if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
959  spin_lock_irqsave(&pool->lock, flags);
960  list_add(&m->list, &pool->prepared_discards);
961  spin_unlock_irqrestore(&pool->lock, flags);
962  wake_worker(pool);
963  }
964  } else {
965  /*
966  * The DM core makes sure that the discard doesn't span
967  * a block boundary. So we submit the discard of a
968  * partial block appropriately.
969  */
970  dm_cell_release_singleton(cell, bio);
971  dm_cell_release_singleton(cell2, bio);
972  if ((!lookup_result.shared) && pool->pf.discard_passdown)
973  remap_and_issue(tc, bio, lookup_result.block);
974  else
975  bio_endio(bio, 0);
976  }
977  break;
978 
979  case -ENODATA:
980  /*
981  * It isn't provisioned, just forget it.
982  */
983  dm_cell_release_singleton(cell, bio);
984  bio_endio(bio, 0);
985  break;
986 
987  default:
988  DMERR("discard: find block unexpectedly returned %d", r);
989  dm_cell_release_singleton(cell, bio);
990  bio_io_error(bio);
991  break;
992  }
993 }
994 
995 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
996  struct dm_cell_key *key,
997  struct dm_thin_lookup_result *lookup_result,
998  struct dm_bio_prison_cell *cell)
999 {
1000  int r;
1001  dm_block_t data_block;
1002 
1003  r = alloc_data_block(tc, &data_block);
1004  switch (r) {
1005  case 0:
1006  schedule_internal_copy(tc, block, lookup_result->block,
1007  data_block, cell, bio);
1008  break;
1009 
1010  case -ENOSPC:
1011  no_space(cell);
1012  break;
1013 
1014  default:
1015  DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1016  dm_cell_error(cell);
1017  break;
1018  }
1019 }
1020 
1021 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1022  dm_block_t block,
1023  struct dm_thin_lookup_result *lookup_result)
1024 {
1025  struct dm_bio_prison_cell *cell;
1026  struct pool *pool = tc->pool;
1027  struct dm_cell_key key;
1028 
1029  /*
1030  * If cell is already occupied, then sharing is already in the process
1031  * of being broken so we have nothing further to do here.
1032  */
1033  build_data_key(tc->td, lookup_result->block, &key);
1034  if (dm_bio_detain(pool->prison, &key, bio, &cell))
1035  return;
1036 
1037  if (bio_data_dir(bio) == WRITE && bio->bi_size)
1038  break_sharing(tc, bio, block, &key, lookup_result, cell);
1039  else {
1040  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1041 
1042  h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1043 
1044  dm_cell_release_singleton(cell, bio);
1045  remap_and_issue(tc, bio, lookup_result->block);
1046  }
1047 }
1048 
1049 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1050  struct dm_bio_prison_cell *cell)
1051 {
1052  int r;
1053  dm_block_t data_block;
1054 
1055  /*
1056  * Remap empty bios (flushes) immediately, without provisioning.
1057  */
1058  if (!bio->bi_size) {
1059  dm_cell_release_singleton(cell, bio);
1060  remap_and_issue(tc, bio, 0);
1061  return;
1062  }
1063 
1064  /*
1065  * Fill read bios with zeroes and complete them immediately.
1066  */
1067  if (bio_data_dir(bio) == READ) {
1068  zero_fill_bio(bio);
1069  dm_cell_release_singleton(cell, bio);
1070  bio_endio(bio, 0);
1071  return;
1072  }
1073 
1074  r = alloc_data_block(tc, &data_block);
1075  switch (r) {
1076  case 0:
1077  if (tc->origin_dev)
1078  schedule_external_copy(tc, block, data_block, cell, bio);
1079  else
1080  schedule_zero(tc, block, data_block, cell, bio);
1081  break;
1082 
1083  case -ENOSPC:
1084  no_space(cell);
1085  break;
1086 
1087  default:
1088  DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1089  set_pool_mode(tc->pool, PM_READ_ONLY);
1090  dm_cell_error(cell);
1091  break;
1092  }
1093 }
1094 
1095 static void process_bio(struct thin_c *tc, struct bio *bio)
1096 {
1097  int r;
1098  dm_block_t block = get_bio_block(tc, bio);
1099  struct dm_bio_prison_cell *cell;
1100  struct dm_cell_key key;
1101  struct dm_thin_lookup_result lookup_result;
1102 
1103  /*
1104  * If cell is already occupied, then the block is already
1105  * being provisioned so we have nothing further to do here.
1106  */
1107  build_virtual_key(tc->td, block, &key);
1108  if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1109  return;
1110 
1111  r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1112  switch (r) {
1113  case 0:
1114  /*
1115  * We can release this cell now. This thread is the only
1116  * one that puts bios into a cell, and we know there were
1117  * no preceding bios.
1118  */
1119  /*
1120  * TODO: this will probably have to change when discard goes
1121  * back in.
1122  */
1123  dm_cell_release_singleton(cell, bio);
1124 
1125  if (lookup_result.shared)
1126  process_shared_bio(tc, bio, block, &lookup_result);
1127  else
1128  remap_and_issue(tc, bio, lookup_result.block);
1129  break;
1130 
1131  case -ENODATA:
1132  if (bio_data_dir(bio) == READ && tc->origin_dev) {
1133  dm_cell_release_singleton(cell, bio);
1134  remap_to_origin_and_issue(tc, bio);
1135  } else
1136  provision_block(tc, bio, block, cell);
1137  break;
1138 
1139  default:
1140  DMERR("dm_thin_find_block() failed, error = %d", r);
1141  dm_cell_release_singleton(cell, bio);
1142  bio_io_error(bio);
1143  break;
1144  }
1145 }
1146 
1147 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1148 {
1149  int r;
1150  int rw = bio_data_dir(bio);
1151  dm_block_t block = get_bio_block(tc, bio);
1152  struct dm_thin_lookup_result lookup_result;
1153 
1154  r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1155  switch (r) {
1156  case 0:
1157  if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1158  bio_io_error(bio);
1159  else
1160  remap_and_issue(tc, bio, lookup_result.block);
1161  break;
1162 
1163  case -ENODATA:
1164  if (rw != READ) {
1165  bio_io_error(bio);
1166  break;
1167  }
1168 
1169  if (tc->origin_dev) {
1170  remap_to_origin_and_issue(tc, bio);
1171  break;
1172  }
1173 
1174  zero_fill_bio(bio);
1175  bio_endio(bio, 0);
1176  break;
1177 
1178  default:
1179  DMERR("dm_thin_find_block() failed, error = %d", r);
1180  bio_io_error(bio);
1181  break;
1182  }
1183 }
1184 
1185 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1186 {
1187  bio_io_error(bio);
1188 }
1189 
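/*
 * need_commit_due_to_time() below returns non-zero once more than
 * COMMIT_PERIOD jiffies have elapsed since the last commit; the first
 * comparison also forces a commit if jiffies has wrapped around.
 */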
1190 static int need_commit_due_to_time(struct pool *pool)
1191 {
1192  return jiffies < pool->last_commit_jiffies ||
1193  jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1194 }
1195 
1196 static void process_deferred_bios(struct pool *pool)
1197 {
1198  unsigned long flags;
1199  struct bio *bio;
1200  struct bio_list bios;
1201 
1202  bio_list_init(&bios);
1203 
1204  spin_lock_irqsave(&pool->lock, flags);
1205  bio_list_merge(&bios, &pool->deferred_bios);
1206  bio_list_init(&pool->deferred_bios);
1207  spin_unlock_irqrestore(&pool->lock, flags);
1208 
1209  while ((bio = bio_list_pop(&bios))) {
1210  struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1211  struct thin_c *tc = h->tc;
1212 
1213  /*
1214  * If we've got no free new_mapping structs, and processing
1215  * this bio might require one, we pause until there are some
1216  * prepared mappings to process.
1217  */
1218  if (ensure_next_mapping(pool)) {
1219  spin_lock_irqsave(&pool->lock, flags);
1220  bio_list_merge(&pool->deferred_bios, &bios);
1221  spin_unlock_irqrestore(&pool->lock, flags);
1222 
1223  break;
1224  }
1225 
1226  if (bio->bi_rw & REQ_DISCARD)
1227  pool->process_discard(tc, bio);
1228  else
1229  pool->process_bio(tc, bio);
1230  }
1231 
1232  /*
1233  * If there are any deferred flush bios, we must commit
1234  * the metadata before issuing them.
1235  */
1236  bio_list_init(&bios);
1237  spin_lock_irqsave(&pool->lock, flags);
1238  bio_list_merge(&bios, &pool->deferred_flush_bios);
1239  bio_list_init(&pool->deferred_flush_bios);
1240  spin_unlock_irqrestore(&pool->lock, flags);
1241 
1242  if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1243  return;
1244 
1245  if (commit_or_fallback(pool)) {
1246  while ((bio = bio_list_pop(&bios)))
1247  bio_io_error(bio);
1248  return;
1249  }
1250  pool->last_commit_jiffies = jiffies;
1251 
1252  while ((bio = bio_list_pop(&bios)))
1253  generic_make_request(bio);
1254 }
1255 
1256 static void do_worker(struct work_struct *ws)
1257 {
1258  struct pool *pool = container_of(ws, struct pool, worker);
1259 
1260  process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1261  process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1262  process_deferred_bios(pool);
1263 }
1264 
1265 /*
1266  * We want to commit periodically so that not too much
1267  * unwritten data builds up.
1268  */
1269 static void do_waker(struct work_struct *ws)
1270 {
1271  struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1272  wake_worker(pool);
1273  queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1274 }
1275 
1276 /*----------------------------------------------------------------*/
1277 
1278 static enum pool_mode get_pool_mode(struct pool *pool)
1279 {
1280  return pool->pf.mode;
1281 }
1282 
1283 static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1284 {
1285  int r;
1286 
1287  pool->pf.mode = mode;
1288 
1289  switch (mode) {
1290  case PM_FAIL:
1291  DMERR("switching pool to failure mode");
1292  pool->process_bio = process_bio_fail;
1293  pool->process_discard = process_bio_fail;
1294  pool->process_prepared_mapping = process_prepared_mapping_fail;
1295  pool->process_prepared_discard = process_prepared_discard_fail;
1296  break;
1297 
1298  case PM_READ_ONLY:
1299  DMERR("switching pool to read-only mode");
1300  r = dm_pool_abort_metadata(pool->pmd);
1301  if (r) {
1302  DMERR("aborting transaction failed");
1303  set_pool_mode(pool, PM_FAIL);
1304  } else {
1305  dm_pool_metadata_read_only(pool->pmd);
1306  pool->process_bio = process_bio_read_only;
1307  pool->process_discard = process_discard;
1308  pool->process_prepared_mapping = process_prepared_mapping_fail;
1309  pool->process_prepared_discard = process_prepared_discard_passdown;
1310  }
1311  break;
1312 
1313  case PM_WRITE:
1314  pool->process_bio = process_bio;
1315  pool->process_discard = process_discard;
1316  pool->process_prepared_mapping = process_prepared_mapping;
1317  pool->process_prepared_discard = process_prepared_discard;
1318  break;
1319  }
1320 }
1321 
1322 /*----------------------------------------------------------------*/
1323 
1324 /*
1325  * Mapping functions.
1326  */
1327 
1328 /*
1329  * Called only while mapping a thin bio to hand it over to the workqueue.
1330  */
1331 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1332 {
1333  unsigned long flags;
1334  struct pool *pool = tc->pool;
1335 
1336  spin_lock_irqsave(&pool->lock, flags);
1337  bio_list_add(&pool->deferred_bios, bio);
1338  spin_unlock_irqrestore(&pool->lock, flags);
1339 
1340  wake_worker(pool);
1341 }
1342 
1343 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1344 {
1345  struct pool *pool = tc->pool;
1346  struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1347 
1348  h->tc = tc;
1349  h->shared_read_entry = NULL;
1350  h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
1351  h->overwrite_mapping = NULL;
1352 
1353  return h;
1354 }
1355 
1356 /*
1357  * Non-blocking function called from the thin target's map function.
1358  */
1359 static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1360  union map_info *map_context)
1361 {
1362  int r;
1363  struct thin_c *tc = ti->private;
1364  dm_block_t block = get_bio_block(tc, bio);
1365  struct dm_thin_device *td = tc->td;
1366  struct dm_thin_lookup_result result;
1367 
1368  map_context->ptr = thin_hook_bio(tc, bio);
1369 
1370  if (get_pool_mode(tc->pool) == PM_FAIL) {
1371  bio_io_error(bio);
1372  return DM_MAPIO_SUBMITTED;
1373  }
1374 
1375  if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1376  thin_defer_bio(tc, bio);
1377  return DM_MAPIO_SUBMITTED;
1378  }
1379 
1380  r = dm_thin_find_block(td, block, 0, &result);
1381 
1382  /*
1383  * Note that we defer readahead too.
1384  */
1385  switch (r) {
1386  case 0:
1387  if (unlikely(result.shared)) {
1388  /*
1389  * We have a race condition here between the
1390  * result.shared value returned by the lookup and
1391  * snapshot creation, which may cause new
1392  * sharing.
1393  *
1394  * To avoid this always quiesce the origin before
1395  * taking the snap. You want to do this anyway to
1396  * ensure a consistent application view
1397  * (i.e. lockfs).
1398  *
1399  * More distant ancestors are irrelevant. The
1400  * shared flag will be set in their case.
1401  */
1402  thin_defer_bio(tc, bio);
1403  r = DM_MAPIO_SUBMITTED;
1404  } else {
1405  remap(tc, bio, result.block);
1406  r = DM_MAPIO_REMAPPED;
1407  }
1408  break;
1409 
1410  case -ENODATA:
1411  if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1412  /*
1413  * This block isn't provisioned, and we have no way
1414  * of doing so. Just error it.
1415  */
1416  bio_io_error(bio);
1417  r = DM_MAPIO_SUBMITTED;
1418  break;
1419  }
1420  /* fall through */
1421 
1422  case -EWOULDBLOCK:
1423  /*
1424  * In future, the failed dm_thin_find_block above could
1425  * provide the hint to load the metadata into cache.
1426  */
1427  thin_defer_bio(tc, bio);
1428  r = DM_MAPIO_SUBMITTED;
1429  break;
1430 
1431  default:
1432  /*
1433  * Must always call bio_io_error on failure.
1434  * dm_thin_find_block can fail with -EINVAL if the
1435  * pool is switched to fail-io mode.
1436  */
1437  bio_io_error(bio);
1438  r = DM_MAPIO_SUBMITTED;
1439  break;
1440  }
1441 
1442  return r;
1443 }
1444 
1445 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1446 {
1447  int r;
1448  unsigned long flags;
1449  struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1450 
1451  spin_lock_irqsave(&pt->pool->lock, flags);
1452  r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1453  spin_unlock_irqrestore(&pt->pool->lock, flags);
1454 
1455  if (!r) {
1456  struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1457  r = bdi_congested(&q->backing_dev_info, bdi_bits);
1458  }
1459 
1460  return r;
1461 }
1462 
1463 static void __requeue_bios(struct pool *pool)
1464 {
1465  bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1466  bio_list_init(&pool->retry_on_resume_list);
1467 }
1468 
1469 /*----------------------------------------------------------------
1470  * Binding of control targets to a pool object
1471  *--------------------------------------------------------------*/
1472 static bool data_dev_supports_discard(struct pool_c *pt)
1473 {
1474  struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1475 
1476  return q && blk_queue_discard(q);
1477 }
1478 
1479 /*
1480  * If discard_passdown was enabled verify that the data device
1481  * supports discards. Disable discard_passdown if not.
1482  */
1483 static void disable_passdown_if_not_supported(struct pool_c *pt)
1484 {
1485  struct pool *pool = pt->pool;
1486  struct block_device *data_bdev = pt->data_dev->bdev;
1487  struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1488  sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1489  const char *reason = NULL;
1490  char buf[BDEVNAME_SIZE];
1491 
1492  if (!pt->adjusted_pf.discard_passdown)
1493  return;
1494 
1495  if (!data_dev_supports_discard(pt))
1496  reason = "discard unsupported";
1497 
1498  else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1499  reason = "max discard sectors smaller than a block";
1500 
1501  else if (data_limits->discard_granularity > block_size)
1502  reason = "discard granularity larger than a block";
1503 
1504  else if (block_size & (data_limits->discard_granularity - 1))
1505  reason = "discard granularity not a factor of block size";
1506 
1507  if (reason) {
1508  DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1509  pt->adjusted_pf.discard_passdown = false;
1510  }
1511 }
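/*
 * To make the checks above concrete (illustrative numbers only): with a
 * 64KB pool block, block_size = 128 << SECTOR_SHIFT = 65536 bytes. A data
 * device advertising discard_granularity = 4096 passes both granularity
 * tests (4096 <= 65536 and 65536 & 4095 == 0), whereas one advertising
 * discard_granularity = 131072 (128KB) is larger than the block, so
 * discard passdown gets disabled.
 */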
1512 
1513 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1514 {
1515  struct pool_c *pt = ti->private;
1516 
1517  /*
1518  * We want to make sure that degraded pools are never upgraded.
1519  */
1520  enum pool_mode old_mode = pool->pf.mode;
1521  enum pool_mode new_mode = pt->adjusted_pf.mode;
1522 
1523  if (old_mode > new_mode)
1524  new_mode = old_mode;
1525 
1526  pool->ti = ti;
1527  pool->low_water_blocks = pt->low_water_blocks;
1528  pool->pf = pt->adjusted_pf;
1529 
1530  set_pool_mode(pool, new_mode);
1531 
1532  return 0;
1533 }
1534 
1535 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1536 {
1537  if (pool->ti == ti)
1538  pool->ti = NULL;
1539 }
1540 
1541 /*----------------------------------------------------------------
1542  * Pool creation
1543  *--------------------------------------------------------------*/
1544 /* Initialize pool features. */
1545 static void pool_features_init(struct pool_features *pf)
1546 {
1547  pf->mode = PM_WRITE;
1548  pf->zero_new_blocks = true;
1549  pf->discard_enabled = true;
1550  pf->discard_passdown = true;
1551 }
1552 
1553 static void __pool_destroy(struct pool *pool)
1554 {
1555  __pool_table_remove(pool);
1556 
1557  if (dm_pool_metadata_close(pool->pmd) < 0)
1558  DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1559 
1560  dm_bio_prison_destroy(pool->prison);
1561  dm_kcopyd_client_destroy(pool->copier);
1562 
1563  if (pool->wq)
1564  destroy_workqueue(pool->wq);
1565 
1566  if (pool->next_mapping)
1567  mempool_free(pool->next_mapping, pool->mapping_pool);
1568  mempool_destroy(pool->mapping_pool);
1569  mempool_destroy(pool->endio_hook_pool);
1570  dm_deferred_set_destroy(pool->shared_read_ds);
1571  dm_deferred_set_destroy(pool->all_io_ds);
1572  kfree(pool);
1573 }
1574 
1575 static struct kmem_cache *_new_mapping_cache;
1576 static struct kmem_cache *_endio_hook_cache;
1577 
1578 static struct pool *pool_create(struct mapped_device *pool_md,
1579  struct block_device *metadata_dev,
1580  unsigned long block_size,
1581  int read_only, char **error)
1582 {
1583  int r;
1584  void *err_p;
1585  struct pool *pool;
1586  struct dm_pool_metadata *pmd;
1587  bool format_device = read_only ? false : true;
1588 
1589  pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1590  if (IS_ERR(pmd)) {
1591  *error = "Error creating metadata object";
1592  return (struct pool *)pmd;
1593  }
1594 
1595  pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1596  if (!pool) {
1597  *error = "Error allocating memory for pool";
1598  err_p = ERR_PTR(-ENOMEM);
1599  goto bad_pool;
1600  }
1601 
1602  pool->pmd = pmd;
1603  pool->sectors_per_block = block_size;
1604  if (block_size & (block_size - 1))
1605  pool->sectors_per_block_shift = -1;
1606  else
1607  pool->sectors_per_block_shift = __ffs(block_size);
1608  pool->low_water_blocks = 0;
1609  pool_features_init(&pool->pf);
1610  pool->prison = dm_bio_prison_create(PRISON_CELLS);
1611  if (!pool->prison) {
1612  *error = "Error creating pool's bio prison";
1613  err_p = ERR_PTR(-ENOMEM);
1614  goto bad_prison;
1615  }
1616 
1617  pool->copier = dm_kcopyd_client_create();
1618  if (IS_ERR(pool->copier)) {
1619  r = PTR_ERR(pool->copier);
1620  *error = "Error creating pool's kcopyd client";
1621  err_p = ERR_PTR(r);
1622  goto bad_kcopyd_client;
1623  }
1624 
1625  /*
1626  * Create singlethreaded workqueue that will service all devices
1627  * that use this metadata.
1628  */
1629  pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1630  if (!pool->wq) {
1631  *error = "Error creating pool's workqueue";
1632  err_p = ERR_PTR(-ENOMEM);
1633  goto bad_wq;
1634  }
1635 
1636  INIT_WORK(&pool->worker, do_worker);
1637  INIT_DELAYED_WORK(&pool->waker, do_waker);
1638  spin_lock_init(&pool->lock);
1639  bio_list_init(&pool->deferred_bios);
1640  bio_list_init(&pool->deferred_flush_bios);
1641  INIT_LIST_HEAD(&pool->prepared_mappings);
1642  INIT_LIST_HEAD(&pool->prepared_discards);
1643  pool->low_water_triggered = 0;
1644  pool->no_free_space = 0;
1645  bio_list_init(&pool->retry_on_resume_list);
1646 
1647  pool->shared_read_ds = dm_deferred_set_create();
1648  if (!pool->shared_read_ds) {
1649  *error = "Error creating pool's shared read deferred set";
1650  err_p = ERR_PTR(-ENOMEM);
1651  goto bad_shared_read_ds;
1652  }
1653 
1654  pool->all_io_ds = dm_deferred_set_create();
1655  if (!pool->all_io_ds) {
1656  *error = "Error creating pool's all io deferred set";
1657  err_p = ERR_PTR(-ENOMEM);
1658  goto bad_all_io_ds;
1659  }
1660 
1661  pool->next_mapping = NULL;
1662  pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1663  _new_mapping_cache);
1664  if (!pool->mapping_pool) {
1665  *error = "Error creating pool's mapping mempool";
1666  err_p = ERR_PTR(-ENOMEM);
1667  goto bad_mapping_pool;
1668  }
1669 
1670  pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
1671  _endio_hook_cache);
1672  if (!pool->endio_hook_pool) {
1673  *error = "Error creating pool's endio_hook mempool";
1674  err_p = ERR_PTR(-ENOMEM);
1675  goto bad_endio_hook_pool;
1676  }
1677  pool->ref_count = 1;
1678  pool->last_commit_jiffies = jiffies;
1679  pool->pool_md = pool_md;
1680  pool->md_dev = metadata_dev;
1681  __pool_table_insert(pool);
1682 
1683  return pool;
1684 
1685 bad_endio_hook_pool:
1686  mempool_destroy(pool->mapping_pool);
1687 bad_mapping_pool:
1688  dm_deferred_set_destroy(pool->all_io_ds);
1689 bad_all_io_ds:
1690  dm_deferred_set_destroy(pool->shared_read_ds);
1691 bad_shared_read_ds:
1692  destroy_workqueue(pool->wq);
1693 bad_wq:
1694  dm_kcopyd_client_destroy(pool->copier);
1695 bad_kcopyd_client:
1696  dm_bio_prison_destroy(pool->prison);
1697 bad_prison:
1698  kfree(pool);
1699 bad_pool:
1700  if (dm_pool_metadata_close(pmd))
1701  DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1702 
1703  return err_p;
1704 }
1705 
1706 static void __pool_inc(struct pool *pool)
1707 {
1708  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1709  pool->ref_count++;
1710 }
1711 
1712 static void __pool_dec(struct pool *pool)
1713 {
1714  BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1715  BUG_ON(!pool->ref_count);
1716  if (!--pool->ref_count)
1717  __pool_destroy(pool);
1718 }
1719 
1720 static struct pool *__pool_find(struct mapped_device *pool_md,
1721  struct block_device *metadata_dev,
1722  unsigned long block_size, int read_only,
1723  char **error, int *created)
1724 {
1725  struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1726 
1727  if (pool) {
1728  if (pool->pool_md != pool_md) {
1729  *error = "metadata device already in use by a pool";
1730  return ERR_PTR(-EBUSY);
1731  }
1732  __pool_inc(pool);
1733 
1734  } else {
1735  pool = __pool_table_lookup(pool_md);
1736  if (pool) {
1737  if (pool->md_dev != metadata_dev) {
1738  *error = "different pool cannot replace a pool";
1739  return ERR_PTR(-EINVAL);
1740  }
1741  __pool_inc(pool);
1742 
1743  } else {
1744  pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1745  *created = 1;
1746  }
1747  }
1748 
1749  return pool;
1750 }
1751 
1752 /*----------------------------------------------------------------
1753  * Pool target methods
1754  *--------------------------------------------------------------*/
1755 static void pool_dtr(struct dm_target *ti)
1756 {
1757  struct pool_c *pt = ti->private;
1758 
1759  mutex_lock(&dm_thin_pool_table.mutex);
1760 
1761  unbind_control_target(pt->pool, ti);
1762  __pool_dec(pt->pool);
1763  dm_put_device(ti, pt->metadata_dev);
1764  dm_put_device(ti, pt->data_dev);
1765  kfree(pt);
1766 
1767  mutex_unlock(&dm_thin_pool_table.mutex);
1768 }
1769 
1770 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1771  struct dm_target *ti)
1772 {
1773  int r;
1774  unsigned argc;
1775  const char *arg_name;
1776 
1777  static struct dm_arg _args[] = {
1778  {0, 3, "Invalid number of pool feature arguments"},
1779  };
1780 
1781  /*
1782  * No feature arguments supplied.
1783  */
1784  if (!as->argc)
1785  return 0;
1786 
1787  r = dm_read_arg_group(_args, as, &argc, &ti->error);
1788  if (r)
1789  return -EINVAL;
1790 
1791  while (argc && !r) {
1792  arg_name = dm_shift_arg(as);
1793  argc--;
1794 
1795  if (!strcasecmp(arg_name, "skip_block_zeroing"))
1796  pf->zero_new_blocks = false;
1797 
1798  else if (!strcasecmp(arg_name, "ignore_discard"))
1799  pf->discard_enabled = false;
1800 
1801  else if (!strcasecmp(arg_name, "no_discard_passdown"))
1802  pf->discard_passdown = false;
1803 
1804  else if (!strcasecmp(arg_name, "read_only"))
1805  pf->mode = PM_READ_ONLY;
1806 
1807  else {
1808  ti->error = "Unrecognised pool feature requested";
1809  r = -EINVAL;
1810  break;
1811  }
1812  }
1813 
1814  return r;
1815 }
1816 
1817 /*
1818  * thin-pool <metadata dev> <data dev>
1819  * <data block size (sectors)>
1820  * <low water mark (blocks)>
1821  * [<#feature args> [<arg>]*]
1822  *
1823  * Optional feature arguments are:
1824  * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1825  * ignore_discard: disable discard
1826  * no_discard_passdown: don't pass discards down to the data device
1827  */
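/*
 * A hypothetical table line matching the format above (device names and
 * sizes are made up for illustration):
 *
 *	dmsetup create pool --table \
 *	  "0 41943040 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768"
 *
 * i.e. a 20GB pool device using 64KB (128-sector) data blocks, raising a
 * dm event when fewer than 32768 free blocks remain, with no optional
 * feature arguments.
 */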
1828 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1829 {
1830  int r, pool_created = 0;
1831  struct pool_c *pt;
1832  struct pool *pool;
1833  struct pool_features pf;
1834  struct dm_arg_set as;
1835  struct dm_dev *data_dev;
1836  unsigned long block_size;
1837  dm_block_t low_water_blocks;
1838  struct dm_dev *metadata_dev;
1839  sector_t metadata_dev_size;
1840  char b[BDEVNAME_SIZE];
1841 
1842  /*
1843  * FIXME Remove validation from scope of lock.
1844  */
1845  mutex_lock(&dm_thin_pool_table.mutex);
1846 
1847  if (argc < 4) {
1848  ti->error = "Invalid argument count";
1849  r = -EINVAL;
1850  goto out_unlock;
1851  }
1852  as.argc = argc;
1853  as.argv = argv;
1854 
1855  r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1856  if (r) {
1857  ti->error = "Error opening metadata block device";
1858  goto out_unlock;
1859  }
1860 
1861  metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1862  if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1863  DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1864  bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1865 
1866  r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1867  if (r) {
1868  ti->error = "Error getting data device";
1869  goto out_metadata;
1870  }
1871 
1872  if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1873  block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1874  block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1875  block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1876  ti->error = "Invalid block size";
1877  r = -EINVAL;
1878  goto out;
1879  }
1880 
1881  if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1882  ti->error = "Invalid low water mark";
1883  r = -EINVAL;
1884  goto out;
1885  }
1886 
1887  /*
1888  * Set default pool features.
1889  */
1890  pool_features_init(&pf);
1891 
1892  dm_consume_args(&as, 4);
1893  r = parse_pool_features(&as, &pf, ti);
1894  if (r)
1895  goto out;
1896 
1897  pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1898  if (!pt) {
1899  r = -ENOMEM;
1900  goto out;
1901  }
1902 
1903  pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1904  block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1905  if (IS_ERR(pool)) {
1906  r = PTR_ERR(pool);
1907  goto out_free_pt;
1908  }
1909 
1910  /*
1911  * 'pool_created' reflects whether this is the first table load.
1912  * Top level discard support is not allowed to be changed after
1913  * initial load. This would require a pool reload to trigger thin
1914  * device changes.
1915  */
1916  if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1917  ti->error = "Discard support cannot be disabled once enabled";
1918  r = -EINVAL;
1919  goto out_flags_changed;
1920  }
1921 
1922  pt->pool = pool;
1923  pt->ti = ti;
1924  pt->metadata_dev = metadata_dev;
1925  pt->data_dev = data_dev;
1926  pt->low_water_blocks = low_water_blocks;
1927  pt->adjusted_pf = pt->requested_pf = pf;
1928  ti->num_flush_requests = 1;
1929 
1930  /*
1931  * Only need to enable discards if the pool should pass
1932  * them down to the data device. The thin device's discard
1933  * processing will cause mappings to be removed from the btree.
1934  */
1935  if (pf.discard_enabled && pf.discard_passdown) {
1936  ti->num_discard_requests = 1;
1937 
1938  /*
1939  * Setting 'discards_supported' circumvents the normal
1940  * stacking of discard limits (this keeps the pool and
1941  * thin devices' discard limits consistent).
1942  */
1943  ti->discards_supported = true;
1944  ti->discard_zeroes_data_unsupported = true;
1945  }
1946  ti->private = pt;
1947 
1948  pt->callbacks.congested_fn = pool_is_congested;
1949  dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1950 
1951  mutex_unlock(&dm_thin_pool_table.mutex);
1952 
1953  return 0;
1954 
1955 out_flags_changed:
1956  __pool_dec(pool);
1957 out_free_pt:
1958  kfree(pt);
1959 out:
1960  dm_put_device(ti, data_dev);
1961 out_metadata:
1962  dm_put_device(ti, metadata_dev);
1963 out_unlock:
1964  mutex_unlock(&dm_thin_pool_table.mutex);
1965 
1966  return r;
1967 }
1968 
1969 static int pool_map(struct dm_target *ti, struct bio *bio,
1970  union map_info *map_context)
1971 {
1972  int r;
1973  struct pool_c *pt = ti->private;
1974  struct pool *pool = pt->pool;
1975  unsigned long flags;
1976 
1977  /*
1978  * As this is a singleton target, ti->begin is always zero.
1979  */
1980  spin_lock_irqsave(&pool->lock, flags);
1981  bio->bi_bdev = pt->data_dev->bdev;
1982  r = DM_MAPIO_REMAPPED;
1983  spin_unlock_irqrestore(&pool->lock, flags);
1984 
1985  return r;
1986 }
1987 
1988 /*
1989  * Retrieves the number of blocks of the data device from
1990  * the superblock and compares it to the actual device size,
1991  * thus resizing the data device in case it has grown.
1992  *
1993  * This both copes with opening preallocated data devices in the ctr
1994  * being followed by a resume
1995  * -and-
1996  * calling the resume method individually after userspace has
1997  * grown the data device in reaction to a table event.
1998  */
1999 static int pool_preresume(struct dm_target *ti)
2000 {
2001  int r;
2002  struct pool_c *pt = ti->private;
2003  struct pool *pool = pt->pool;
2004  sector_t data_size = ti->len;
2005  dm_block_t sb_data_size;
2006 
2007  /*
2008  * Take control of the pool object.
2009  */
2010  r = bind_control_target(pool, ti);
2011  if (r)
2012  return r;
2013 
2014  (void) sector_div(data_size, pool->sectors_per_block);
2015 
2016  r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2017  if (r) {
2018  DMERR("failed to retrieve data device size");
2019  return r;
2020  }
2021 
2022  if (data_size < sb_data_size) {
2023  DMERR("pool target too small, is %llu blocks (expected %llu)",
2024  (unsigned long long)data_size, sb_data_size);
2025  return -EINVAL;
2026 
2027  } else if (data_size > sb_data_size) {
2028  r = dm_pool_resize_data_dev(pool->pmd, data_size);
2029  if (r) {
2030  DMERR("failed to resize data device");
2031  /* FIXME Stricter than necessary: Rollback transaction instead here */
2032  set_pool_mode(pool, PM_READ_ONLY);
2033  return r;
2034  }
2035 
2036  (void) commit_or_fallback(pool);
2037  }
2038 
2039  return 0;
2040 }
2041 
2042 static void pool_resume(struct dm_target *ti)
2043 {
2044  struct pool_c *pt = ti->private;
2045  struct pool *pool = pt->pool;
2046  unsigned long flags;
2047 
2048  spin_lock_irqsave(&pool->lock, flags);
2049  pool->low_water_triggered = 0;
2050  pool->no_free_space = 0;
2051  __requeue_bios(pool);
2052  spin_unlock_irqrestore(&pool->lock, flags);
2053 
2054  do_waker(&pool->waker.work);
2055 }
2056 
2057 static void pool_postsuspend(struct dm_target *ti)
2058 {
2059  struct pool_c *pt = ti->private;
2060  struct pool *pool = pt->pool;
2061 
2062  cancel_delayed_work(&pool->waker);
2063  flush_workqueue(pool->wq);
2064  (void) commit_or_fallback(pool);
2065 }
2066 
2067 static int check_arg_count(unsigned argc, unsigned args_required)
2068 {
2069  if (argc != args_required) {
2070  DMWARN("Message received with %u arguments instead of %u.",
2071  argc, args_required);
2072  return -EINVAL;
2073  }
2074 
2075  return 0;
2076 }
2077 
2078 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2079 {
2080  if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2081  *dev_id <= MAX_DEV_ID)
2082  return 0;
2083 
2084  if (warning)
2085  DMWARN("Message received with invalid device id: %s", arg);
2086 
2087  return -EINVAL;
2088 }
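read_dev_id() accepts only a plain decimal id that fits in the 24-bit device-id space, i.e. 0 through 16777215 (MAX_DEV_ID). A userspace sketch of the same parse-and-range check, substituting strtoull() for the kernel's kstrtoull(); the helper name parse_dev_id is made up for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    static int parse_dev_id(const char *arg, unsigned long long *dev_id)
    {
    	char *end;

    	errno = 0;
    	*dev_id = strtoull(arg, &end, 10);
    	if (errno || end == arg || *end != '\0' || *dev_id > 16777215ULL)
    		return -1;	/* non-numeric, trailing junk, or out of range */
    	return 0;
    }

    int main(void)
    {
    	unsigned long long id;

    	printf("\"42\"       -> %d\n", parse_dev_id("42", &id));	/* 0, accepted */
    	printf("\"16777216\" -> %d\n", parse_dev_id("16777216", &id));	/* -1, one past MAX_DEV_ID */
    	return 0;
    }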
2089 
2090 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2091 {
2092  dm_thin_id dev_id;
2093  int r;
2094 
2095  r = check_arg_count(argc, 2);
2096  if (r)
2097  return r;
2098 
2099  r = read_dev_id(argv[1], &dev_id, 1);
2100  if (r)
2101  return r;
2102 
2103  r = dm_pool_create_thin(pool->pmd, dev_id);
2104  if (r) {
2105  DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2106  argv[1]);
2107  return r;
2108  }
2109 
2110  return 0;
2111 }
2112 
2113 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2114 {
2115  dm_thin_id dev_id;
2116  dm_thin_id origin_dev_id;
2117  int r;
2118 
2119  r = check_arg_count(argc, 3);
2120  if (r)
2121  return r;
2122 
2123  r = read_dev_id(argv[1], &dev_id, 1);
2124  if (r)
2125  return r;
2126 
2127  r = read_dev_id(argv[2], &origin_dev_id, 1);
2128  if (r)
2129  return r;
2130 
2131  r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2132  if (r) {
2133  DMWARN("Creation of new snapshot %s of device %s failed.",
2134  argv[1], argv[2]);
2135  return r;
2136  }
2137 
2138  return 0;
2139 }
2140 
2141 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2142 {
2143  dm_thin_id dev_id;
2144  int r;
2145 
2146  r = check_arg_count(argc, 2);
2147  if (r)
2148  return r;
2149 
2150  r = read_dev_id(argv[1], &dev_id, 1);
2151  if (r)
2152  return r;
2153 
2154  r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2155  if (r)
2156  DMWARN("Deletion of thin device %s failed.", argv[1]);
2157 
2158  return r;
2159 }
2160 
2161 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2162 {
2163  dm_thin_id old_id, new_id;
2164  int r;
2165 
2166  r = check_arg_count(argc, 3);
2167  if (r)
2168  return r;
2169 
2170  if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2171  DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2172  return -EINVAL;
2173  }
2174 
2175  if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2176  DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2177  return -EINVAL;
2178  }
2179 
2180  r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2181  if (r) {
2182  DMWARN("Failed to change transaction id from %s to %s.",
2183  argv[1], argv[2]);
2184  return r;
2185  }
2186 
2187  return 0;
2188 }
2189 
2190 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2191 {
2192  int r;
2193 
2194  r = check_arg_count(argc, 1);
2195  if (r)
2196  return r;
2197 
2198  (void) commit_or_fallback(pool);
2199 
2200  r = dm_pool_reserve_metadata_snap(pool->pmd);
2201  if (r)
2202  DMWARN("reserve_metadata_snap message failed.");
2203 
2204  return r;
2205 }
2206 
2207 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2208 {
2209  int r;
2210 
2211  r = check_arg_count(argc, 1);
2212  if (r)
2213  return r;
2214 
2215  r = dm_pool_release_metadata_snap(pool->pmd);
2216  if (r)
2217  DMWARN("release_metadata_snap message failed.");
2218 
2219  return r;
2220 }
2221 
2222 /*
2223  * Messages supported:
2224  * create_thin <dev_id>
2225  * create_snap <dev_id> <origin_id>
2226  * delete <dev_id>
2227  * trim <dev_id> <new_size_in_sectors>
2228  * set_transaction_id <current_trans_id> <new_trans_id>
2229  * reserve_metadata_snap
2230  * release_metadata_snap
2231  */
2232 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2233 {
2234  int r = -EINVAL;
2235  struct pool_c *pt = ti->private;
2236  struct pool *pool = pt->pool;
2237 
2238  if (!strcasecmp(argv[0], "create_thin"))
2239  r = process_create_thin_mesg(argc, argv, pool);
2240 
2241  else if (!strcasecmp(argv[0], "create_snap"))
2242  r = process_create_snap_mesg(argc, argv, pool);
2243 
2244  else if (!strcasecmp(argv[0], "delete"))
2245  r = process_delete_mesg(argc, argv, pool);
2246 
2247  else if (!strcasecmp(argv[0], "set_transaction_id"))
2248  r = process_set_transaction_id_mesg(argc, argv, pool);
2249 
2250  else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2251  r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2252 
2253  else if (!strcasecmp(argv[0], "release_metadata_snap"))
2254  r = process_release_metadata_snap_mesg(argc, argv, pool);
2255 
2256  else
2257  DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2258 
2259  if (!r)
2260  (void) commit_or_fallback(pool);
2261 
2262  return r;
2263 }
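pool_message() is reached through the device-mapper message ioctl, most conveniently via dmsetup. As a hypothetical illustration (pool name and device ids are made up), creating a thin device with id 0 and then an internal snapshot of it with id 1 could look like:

    dmsetup message /dev/mapper/pool 0 "create_thin 0"
    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"

The numeric argument before the message string is the sector used to pick a target within the table; the pool is a singleton target, so 0 is always used. Any message that succeeds is immediately followed by commit_or_fallback(), so its effect is durable as soon as the ioctl returns.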
2264 
2265 static void emit_flags(struct pool_features *pf, char *result,
2266  unsigned sz, unsigned maxlen)
2267 {
2268  unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2269  !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2270  DMEMIT("%u ", count);
2271 
2272  if (!pf->zero_new_blocks)
2273  DMEMIT("skip_block_zeroing ");
2274 
2275  if (!pf->discard_enabled)
2276  DMEMIT("ignore_discard ");
2277 
2278  if (!pf->discard_passdown)
2279  DMEMIT("no_discard_passdown ");
2280 
2281  if (pf->mode == PM_READ_ONLY)
2282  DMEMIT("read_only ");
2283 }
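As a worked example (feature values hypothetical): for a pool with block zeroing disabled, discards enabled but passdown disabled, and in read-write mode, count evaluates to 1 + 0 + 1 + 0 = 2, so emit_flags() appends "2 skip_block_zeroing no_discard_passdown " to the status line.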
2284 
2285 /*
2286  * Status line is:
2287  * <transaction id> <used metadata blocks>/<total metadata blocks>
2288  * <used data blocks>/<total data blocks> <held metadata root>
2289  */
2290 static int pool_status(struct dm_target *ti, status_type_t type,
2291  unsigned status_flags, char *result, unsigned maxlen)
2292 {
2293  int r;
2294  unsigned sz = 0;
2295  uint64_t transaction_id;
2296  dm_block_t nr_free_blocks_data;
2297  dm_block_t nr_free_blocks_metadata;
2298  dm_block_t nr_blocks_data;
2299  dm_block_t nr_blocks_metadata;
2300  dm_block_t held_root;
2301  char buf[BDEVNAME_SIZE];
2302  char buf2[BDEVNAME_SIZE];
2303  struct pool_c *pt = ti->private;
2304  struct pool *pool = pt->pool;
2305 
2306  switch (type) {
2307  case STATUSTYPE_INFO:
2308  if (get_pool_mode(pool) == PM_FAIL) {
2309  DMEMIT("Fail");
2310  break;
2311  }
2312 
2313  /* Commit to ensure statistics aren't out-of-date */
2314  if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2315  (void) commit_or_fallback(pool);
2316 
2317  r = dm_pool_get_metadata_transaction_id(pool->pmd,
2318  &transaction_id);
2319  if (r)
2320  return r;
2321 
2322  r = dm_pool_get_free_metadata_block_count(pool->pmd,
2323  &nr_free_blocks_metadata);
2324  if (r)
2325  return r;
2326 
2327  r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2328  if (r)
2329  return r;
2330 
2331  r = dm_pool_get_free_block_count(pool->pmd,
2332  &nr_free_blocks_data);
2333  if (r)
2334  return r;
2335 
2336  r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2337  if (r)
2338  return r;
2339 
2340  r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2341  if (r)
2342  return r;
2343 
2344  DMEMIT("%llu %llu/%llu %llu/%llu ",
2345  (unsigned long long)transaction_id,
2346  (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2347  (unsigned long long)nr_blocks_metadata,
2348  (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2349  (unsigned long long)nr_blocks_data);
2350 
2351  if (held_root)
2352  DMEMIT("%llu ", held_root);
2353  else
2354  DMEMIT("- ");
2355 
2356  if (pool->pf.mode == PM_READ_ONLY)
2357  DMEMIT("ro ");
2358  else
2359  DMEMIT("rw ");
2360 
2361  if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2362  DMEMIT("discard_passdown");
2363  else
2364  DMEMIT("no_discard_passdown");
2365 
2366  break;
2367 
2368  case STATUSTYPE_TABLE:
2369  DMEMIT("%s %s %lu %llu ",
2370  format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2371  format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2372  (unsigned long)pool->sectors_per_block,
2373  (unsigned long long)pt->low_water_blocks);
2374  emit_flags(&pt->requested_pf, result, sz, maxlen);
2375  break;
2376  }
2377 
2378  return 0;
2379 }
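As a hypothetical example of the INFO output assembled above, a healthy writeable pool might emit

    0 882/4161600 10240/2097152 - rw discard_passdown

meaning transaction id 0, 882 of 4161600 metadata blocks used, 10240 of 2097152 data blocks used, no held metadata root, read-write mode, and discards passed down to the data device. The TABLE case instead replays the constructor arguments (metadata device, data device, block size, low-water mark) followed by the feature flags from emit_flags().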
2380 
2381 static int pool_iterate_devices(struct dm_target *ti,
2382  iterate_devices_callout_fn fn, void *data)
2383 {
2384  struct pool_c *pt = ti->private;
2385 
2386  return fn(ti, pt->data_dev, 0, ti->len, data);
2387 }
2388 
2389 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2390  struct bio_vec *biovec, int max_size)
2391 {
2392  struct pool_c *pt = ti->private;
2393  struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2394 
2395  if (!q->merge_bvec_fn)
2396  return max_size;
2397 
2398  bvm->bi_bdev = pt->data_dev->bdev;
2399 
2400  return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2401 }
2402 
2403 static bool block_size_is_power_of_two(struct pool *pool)
2404 {
2405  return pool->sectors_per_block_shift >= 0;
2406 }
2407 
2408 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2409 {
2410  struct pool *pool = pt->pool;
2411  struct queue_limits *data_limits;
2412 
2413  limits->max_discard_sectors = pool->sectors_per_block;
2414 
2415  /*
2416  * discard_granularity is just a hint, and not enforced.
2417  */
2418  if (pt->adjusted_pf.discard_passdown) {
2419  data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2420  limits->discard_granularity = data_limits->discard_granularity;
2421  } else if (block_size_is_power_of_two(pool))
2422  limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2423  else
2424  /*
2425  * Use largest power of 2 that is a factor of sectors_per_block
2426  * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2427  */
2428  limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2429  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2430 }
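For a block size that is not a power of two, the expression above picks the largest power-of-two factor of sectors_per_block, so the advertised granularity still divides the block evenly. A minimal userspace sketch of the bit trick (the 384-sector block size is hypothetical):

    #include <stdio.h>
    #include <strings.h>	/* ffs() */

    int main(void)
    {
    	unsigned sectors_per_block = 384;	/* hypothetical 192 KiB pool block */

    	/*
    	 * ffs() gives the 1-based index of the lowest set bit:
    	 * ffs(384) == 8, so 1 << 7 == 128 is the largest power of two
    	 * that divides 384.
    	 */
    	unsigned granularity = 1u << (ffs(sectors_per_block) - 1);

    	printf("%u-sector blocks -> %u-sector granularity hint\n",
    	       sectors_per_block, granularity);
    	return 0;
    }

The max() against DATA_DEV_BLOCK_SIZE_MIN_SECTORS then keeps the hint from dropping below the 64 KiB minimum block size.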
2431 
2432 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2433 {
2434  struct pool_c *pt = ti->private;
2435  struct pool *pool = pt->pool;
2436 
2437  blk_limits_io_min(limits, 0);
2438  blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2439 
2440  /*
2441  * pt->adjusted_pf is a staging area for the actual features to use.
2442  * They get transferred to the live pool in bind_control_target()
2443  * called from pool_preresume().
2444  */
2445  if (!pt->adjusted_pf.discard_enabled)
2446  return;
2447 
2448  disable_passdown_if_not_supported(pt);
2449 
2450  set_discard_limits(pt, limits);
2451 }
2452 
2453 static struct target_type pool_target = {
2454  .name = "thin-pool",
2455  .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2456  DM_TARGET_IMMUTABLE,
2457  .version = {1, 5, 0},
2458  .module = THIS_MODULE,
2459  .ctr = pool_ctr,
2460  .dtr = pool_dtr,
2461  .map = pool_map,
2462  .postsuspend = pool_postsuspend,
2463  .preresume = pool_preresume,
2464  .resume = pool_resume,
2465  .message = pool_message,
2466  .status = pool_status,
2467  .merge = pool_merge,
2468  .iterate_devices = pool_iterate_devices,
2469  .io_hints = pool_io_hints,
2470 };
2471 
2472 /*----------------------------------------------------------------
2473  * Thin target methods
2474  *--------------------------------------------------------------*/
2475 static void thin_dtr(struct dm_target *ti)
2476 {
2477  struct thin_c *tc = ti->private;
2478 
2479  mutex_lock(&dm_thin_pool_table.mutex);
2480 
2481  __pool_dec(tc->pool);
2482  dm_pool_close_thin_device(tc->td);
2483  dm_put_device(ti, tc->pool_dev);
2484  if (tc->origin_dev)
2485  dm_put_device(ti, tc->origin_dev);
2486  kfree(tc);
2487 
2488  mutex_unlock(&dm_thin_pool_table.mutex);
2489 }
2490 
2491 /*
2492  * Thin target parameters:
2493  *
2494  * <pool_dev> <dev_id> [origin_dev]
2495  *
2496  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2497  * dev_id: the internal device identifier
2498  * origin_dev: a device external to the pool that should act as the origin
2499  *
2500  * If the pool device has discards disabled, they get disabled for the thin
2501  * device as well.
2502  */
2503 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2504 {
2505  int r;
2506  struct thin_c *tc;
2507  struct dm_dev *pool_dev, *origin_dev;
2508  struct mapped_device *pool_md;
2509 
2510  mutex_lock(&dm_thin_pool_table.mutex);
2511 
2512  if (argc != 2 && argc != 3) {
2513  ti->error = "Invalid argument count";
2514  r = -EINVAL;
2515  goto out_unlock;
2516  }
2517 
2518  tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2519  if (!tc) {
2520  ti->error = "Out of memory";
2521  r = -ENOMEM;
2522  goto out_unlock;
2523  }
2524 
2525  if (argc == 3) {
2526  r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2527  if (r) {
2528  ti->error = "Error opening origin device";
2529  goto bad_origin_dev;
2530  }
2531  tc->origin_dev = origin_dev;
2532  }
2533 
2534  r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2535  if (r) {
2536  ti->error = "Error opening pool device";
2537  goto bad_pool_dev;
2538  }
2539  tc->pool_dev = pool_dev;
2540 
2541  if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2542  ti->error = "Invalid device id";
2543  r = -EINVAL;
2544  goto bad_common;
2545  }
2546 
2547  pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2548  if (!pool_md) {
2549  ti->error = "Couldn't get pool mapped device";
2550  r = -EINVAL;
2551  goto bad_common;
2552  }
2553 
2554  tc->pool = __pool_table_lookup(pool_md);
2555  if (!tc->pool) {
2556  ti->error = "Couldn't find pool object";
2557  r = -EINVAL;
2558  goto bad_pool_lookup;
2559  }
2560  __pool_inc(tc->pool);
2561 
2562  if (get_pool_mode(tc->pool) == PM_FAIL) {
2563  ti->error = "Couldn't open thin device, Pool is in fail mode";
2564  goto bad_thin_open;
2565  }
2566 
2567  r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2568  if (r) {
2569  ti->error = "Couldn't open thin internal device";
2570  goto bad_thin_open;
2571  }
2572 
2573  r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2574  if (r)
2575  goto bad_thin_open;
2576 
2577  ti->num_flush_requests = 1;
2578  ti->flush_supported = true;
2579 
2580  /* In case the pool supports discards, pass them on. */
2581  if (tc->pool->pf.discard_enabled) {
2582  ti->discards_supported = true;
2583  ti->num_discard_requests = 1;
2584  ti->discard_zeroes_data_unsupported = true;
2585  /* Discard requests must be split on a block boundary */
2586  ti->split_discard_requests = true;
2587  }
2588 
2589  dm_put(pool_md);
2590 
2591  mutex_unlock(&dm_thin_pool_table.mutex);
2592 
2593  return 0;
2594 
2595 bad_thin_open:
2596  __pool_dec(tc->pool);
2597 bad_pool_lookup:
2598  dm_put(pool_md);
2599 bad_common:
2600  dm_put_device(ti, tc->pool_dev);
2601 bad_pool_dev:
2602  if (tc->origin_dev)
2603  dm_put_device(ti, tc->origin_dev);
2604 bad_origin_dev:
2605  kfree(tc);
2606 out_unlock:
2607  mutex_unlock(&dm_thin_pool_table.mutex);
2608 
2609  return r;
2610 }
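Tying the parameter description above thin_ctr() together: a hypothetical 1 GiB thin volume backed by a pool at /dev/mapper/pool, using internal device id 0 (created earlier with a create_thin message), would be activated with a table line of the form

    0 2097152 thin /dev/mapper/pool 0

while a snapshot of an external origin would append the origin device path as the optional third target argument.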
2611 
2612 static int thin_map(struct dm_target *ti, struct bio *bio,
2613  union map_info *map_context)
2614 {
2615  bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2616 
2617  return thin_bio_map(ti, bio, map_context);
2618 }
2619 
2620 static int thin_endio(struct dm_target *ti,
2621  struct bio *bio, int err,
2622  union map_info *map_context)
2623 {
2624  unsigned long flags;
2625  struct dm_thin_endio_hook *h = map_context->ptr;
2626  struct list_head work;
2627  struct dm_thin_new_mapping *m, *tmp;
2628  struct pool *pool = h->tc->pool;
2629 
2630  if (h->shared_read_entry) {
2631  INIT_LIST_HEAD(&work);
2632  dm_deferred_entry_dec(h->shared_read_entry, &work);
2633 
2634  spin_lock_irqsave(&pool->lock, flags);
2635  list_for_each_entry_safe(m, tmp, &work, list) {
2636  list_del(&m->list);
2637  m->quiesced = 1;
2638  __maybe_add_mapping(m);
2639  }
2640  spin_unlock_irqrestore(&pool->lock, flags);
2641  }
2642 
2643  if (h->all_io_entry) {
2644  INIT_LIST_HEAD(&work);
2645  dm_deferred_entry_dec(h->all_io_entry, &work);
2646  spin_lock_irqsave(&pool->lock, flags);
2647  list_for_each_entry_safe(m, tmp, &work, list)
2648  list_add(&m->list, &pool->prepared_discards);
2649  spin_unlock_irqrestore(&pool->lock, flags);
2650  }
2651 
2652  mempool_free(h, pool->endio_hook_pool);
2653 
2654  return 0;
2655 }
2656 
2657 static void thin_postsuspend(struct dm_target *ti)
2658 {
2659  if (dm_noflush_suspending(ti))
2660  requeue_io((struct thin_c *)ti->private);
2661 }
2662 
2663 /*
2664  * <nr mapped sectors> <highest mapped sector>
2665  */
2666 static int thin_status(struct dm_target *ti, status_type_t type,
2667  unsigned status_flags, char *result, unsigned maxlen)
2668 {
2669  int r;
2670  ssize_t sz = 0;
2671  dm_block_t mapped, highest;
2672  char buf[BDEVNAME_SIZE];
2673  struct thin_c *tc = ti->private;
2674 
2675  if (get_pool_mode(tc->pool) == PM_FAIL) {
2676  DMEMIT("Fail");
2677  return 0;
2678  }
2679 
2680  if (!tc->td)
2681  DMEMIT("-");
2682  else {
2683  switch (type) {
2684  case STATUSTYPE_INFO:
2685  r = dm_thin_get_mapped_count(tc->td, &mapped);
2686  if (r)
2687  return r;
2688 
2689  r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2690  if (r < 0)
2691  return r;
2692 
2693  DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2694  if (r)
2695  DMEMIT("%llu", ((highest + 1) *
2696  tc->pool->sectors_per_block) - 1);
2697  else
2698  DMEMIT("-");
2699  break;
2700 
2701  case STATUSTYPE_TABLE:
2702  DMEMIT("%s %lu",
2703  format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2704  (unsigned long) tc->dev_id);
2705  if (tc->origin_dev)
2706  DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2707  break;
2708  }
2709  }
2710 
2711  return 0;
2712 }
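As a worked example of the INFO output above (numbers made up): with a 128-sector block size and 16384 blocks mapped, the target emits "2097152 2097151", i.e. 16384 * 128 mapped sectors and a highest mapped sector of (16383 + 1) * 128 - 1. A device with nothing provisioned yet emits "0 -", because dm_thin_get_highest_mapped_block() reports that no block is mapped.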
2713 
2714 static int thin_iterate_devices(struct dm_target *ti,
2715  iterate_devices_callout_fn fn, void *data)
2716 {
2717  sector_t blocks;
2718  struct thin_c *tc = ti->private;
2719  struct pool *pool = tc->pool;
2720 
2721  /*
2722  * We can't call dm_pool_get_data_dev_size() since that blocks. So
2723  * we follow a more convoluted path through to the pool's target.
2724  */
2725  if (!pool->ti)
2726  return 0; /* nothing is bound */
2727 
2728  blocks = pool->ti->len;
2729  (void) sector_div(blocks, pool->sectors_per_block);
2730  if (blocks)
2731  return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2732 
2733  return 0;
2734 }
2735 
2736 /*
2737  * A thin device always inherits its queue limits from its pool.
2738  */
2739 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2740 {
2741  struct thin_c *tc = ti->private;
2742 
2743  *limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
2744 }
2745 
2746 static struct target_type thin_target = {
2747  .name = "thin",
2748  .version = {1, 5, 0},
2749  .module = THIS_MODULE,
2750  .ctr = thin_ctr,
2751  .dtr = thin_dtr,
2752  .map = thin_map,
2753  .end_io = thin_endio,
2754  .postsuspend = thin_postsuspend,
2755  .status = thin_status,
2756  .iterate_devices = thin_iterate_devices,
2757  .io_hints = thin_io_hints,
2758 };
2759 
2760 /*----------------------------------------------------------------*/
2761 
2762 static int __init dm_thin_init(void)
2763 {
2764  int r;
2765 
2766  pool_table_init();
2767 
2768  r = dm_register_target(&thin_target);
2769  if (r)
2770  return r;
2771 
2772  r = dm_register_target(&pool_target);
2773  if (r)
2774  goto bad_pool_target;
2775 
2776  r = -ENOMEM;
2777 
2778  _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2779  if (!_new_mapping_cache)
2780  goto bad_new_mapping_cache;
2781 
2782  _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
2783  if (!_endio_hook_cache)
2784  goto bad_endio_hook_cache;
2785 
2786  return 0;
2787 
2788 bad_endio_hook_cache:
2789  kmem_cache_destroy(_new_mapping_cache);
2790 bad_new_mapping_cache:
2791  dm_unregister_target(&pool_target);
2792 bad_pool_target:
2793  dm_unregister_target(&thin_target);
2794 
2795  return r;
2796 }
2797 
2798 static void dm_thin_exit(void)
2799 {
2800  dm_unregister_target(&thin_target);
2801  dm_unregister_target(&pool_target);
2802 
2803  kmem_cache_destroy(_new_mapping_cache);
2804  kmem_cache_destroy(_endio_hook_cache);
2805 }
2806 
2807 module_init(dm_thin_init);
2808 module_exit(dm_thin_exit);
2809 
2810 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2811 MODULE_AUTHOR("Joe Thornber <[email protected]>");
2812 MODULE_LICENSE("GPL");