drbd_bitmap.c
1 /*
2  drbd_bitmap.c
3 
4  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5 
6  Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
7  Copyright (C) 2004-2008, Philipp Reisner <[email protected]>.
8  Copyright (C) 2004-2008, Lars Ellenberg <[email protected]>.
9 
10  drbd is free software; you can redistribute it and/or modify
11  it under the terms of the GNU General Public License as published by
12  the Free Software Foundation; either version 2, or (at your option)
13  any later version.
14 
15  drbd is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  GNU General Public License for more details.
19 
20  You should have received a copy of the GNU General Public License
21  along with drbd; see the file COPYING. If not, write to
22  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 
25 #include <linux/bitops.h>
26 #include <linux/vmalloc.h>
27 #include <linux/string.h>
28 #include <linux/drbd.h>
29 #include <linux/slab.h>
30 #include <asm/kmap_types.h>
31 
32 #include "drbd_int.h"
33 
34 
35 /* OPAQUE outside this file!
36  * interface defined in drbd_int.h
37 
38  * convention:
39  * function name drbd_bm_... => used elsewhere, "public".
40  * function name bm_... => internal to implementation, "private".
41  */
42 
43 
44 /*
45  * LIMITATIONS:
46  * We want to support >= 1 PiB of backend storage, while for now still using
47  * a granularity of one bit per 4KiB of storage.
48  * 1 << 50 bytes backend storage (1 PiB)
49  * 1 << (50 - 12) bits needed
50  * 38 --> we need u64 to index and count bits
51  * 1 << (38 - 3) bitmap bytes needed
52  * 35 --> we still need u64 to index and count bytes
53  * (that's 32 GiB of bitmap for 1 PiB storage)
54  * 1 << (35 - 2) 32bit longs needed
55  * 33 --> we'd even need u64 to index and count 32bit long words.
56  * 1 << (35 - 3) 64bit longs needed
57  * 32 --> we could get away with a 32bit unsigned int to index and count
58  * 64bit long words, but I rather stay with unsigned long for now.
59  * We probably should neither count nor point to bytes or long words
60  * directly, but either by bitnumber, or by page index and offset.
61  * 1 << (35 - 12)
62  * 23 --> we need that many 4KiB pages of bitmap.
63  * 1 << (23 + 3) --> on a 64bit arch,
64  * we need 64 MiB to store the array of page pointers.
65  *
66  * Because I'm lazy, and because the resulting patch was too large, too ugly
67  * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
68  * (1 << 32) bits * 4k storage.
69  *
70 
71  * bitmap storage and IO:
72  * Bitmap is stored little endian on disk, and is kept little endian in
73  * core memory. Currently we still hold the full bitmap in core as long
74  * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
75  * seems excessive.
76  *
77  * We plan to reduce the amount of in-core bitmap pages by paging them in
78  * and out against their on-disk location as necessary, but need to make
79  * sure we don't cause too much meta data IO, and must not deadlock in
80  * tight memory situations. This needs some more work.
81  */
82 
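/* A small sketch of the arithmetic above (illustrative only, not part of
 * the driver; assumes 4 KiB pages and 8 byte page pointers): */
#if 0
static void bm_size_example(void)
{
	u64 storage = 1ULL << 50;	/* 1 PiB of backend storage */
	u64 bits    = storage >> 12;	/* one bit per 4 KiB: 1 << 38 bits */
	u64 bytes   = bits >> 3;	/* 1 << 35 bytes: 32 GiB of bitmap */
	u64 pages   = bytes >> 12;	/* 1 << 23 bitmap pages of 4 KiB */

	/* 1 << 26 bytes, i.e. 64 MiB, just for the page pointer array: */
	printk(KERN_DEBUG "page pointer array: %llu bytes\n",
	       (unsigned long long)(pages * sizeof(struct page *)));
}
#endif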
83 /*
84  * NOTE
85  * Access to the *bm_pages is protected by bm_lock.
86  * It is safe to read the other members within the lock.
87  *
88  * drbd_bm_set_bits is called from bio_endio callbacks;
89  * we may be called with irq already disabled,
90  * so we need spin_lock_irqsave().
91  * And we need the kmap_atomic.
92  */
93 struct drbd_bitmap {
94  struct page **bm_pages;
95  spinlock_t bm_lock;
96 
97  /* see LIMITATIONS: above */
98 
99  unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
100  unsigned long bm_bits;
101  size_t bm_words;
102  size_t bm_number_of_pages;
103  sector_t bm_dev_capacity;
104  struct mutex bm_change; /* serializes resize operations */
105 
106  wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
107 
108  enum bm_flag bm_flags;
109 
110  /* debugging aid, in case we are still racy somewhere */
111  char *bm_why;
112  struct task_struct *bm_task;
113 };
114 
115 #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
116 static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
117 {
118  struct drbd_bitmap *b = mdev->bitmap;
119  if (!__ratelimit(&drbd_ratelimit_state))
120  return;
121  dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
122  current == mdev->receiver.task ? "receiver" :
123  current == mdev->asender.task ? "asender" :
124  current == mdev->worker.task ? "worker" : current->comm,
125  func, b->bm_why ?: "?",
126  b->bm_task == mdev->receiver.task ? "receiver" :
127  b->bm_task == mdev->asender.task ? "asender" :
128  b->bm_task == mdev->worker.task ? "worker" : "?");
129 }
130 
131 void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
132 {
133  struct drbd_bitmap *b = mdev->bitmap;
134  int trylock_failed;
135 
136  if (!b) {
137  dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
138  return;
139  }
140 
141  trylock_failed = !mutex_trylock(&b->bm_change);
142 
143  if (trylock_failed) {
144  dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
145  current == mdev->receiver.task ? "receiver" :
146  current == mdev->asender.task ? "asender" :
147  current == mdev->worker.task ? "worker" : current->comm,
148  why, b->bm_why ?: "?",
149  b->bm_task == mdev->receiver.task ? "receiver" :
150  b->bm_task == mdev->asender.task ? "asender" :
151  b->bm_task == mdev->worker.task ? "worker" : "?");
152  mutex_lock(&b->bm_change);
153  }
154  if (BM_LOCKED_MASK & b->bm_flags)
155  dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
156  b->bm_flags |= flags & BM_LOCKED_MASK;
157 
158  b->bm_why = why;
159  b->bm_task = current;
160 }
161 
162 void drbd_bm_unlock(struct drbd_conf *mdev)
163 {
164  struct drbd_bitmap *b = mdev->bitmap;
165  if (!b) {
166  dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
167  return;
168  }
169 
170  if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
171  dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
172 
173  b->bm_flags &= ~BM_LOCKED_MASK;
174  b->bm_why = NULL;
175  b->bm_task = NULL;
176  mutex_unlock(&b->bm_change);
177 }
178 
179 /* we store some "meta" info about our pages in page->private */
180 /* at a granularity of 4k storage per bitmap bit:
181  * one petabyte of storage: 1<<50 bytes, 1<<38 * 4k storage blocks
182  * 1<<38 bits,
183  * 1<<23 4k bitmap pages.
184  * Use 24 bits as page index, covers 2 petabytes of storage
185  * at a granularity of 4k per bit.
186  * Used to report the failed page idx on io error from the endio handlers.
187  */
188 #define BM_PAGE_IDX_MASK ((1UL<<24)-1)
189 /* this page is currently read in, or written back */
190 #define BM_PAGE_IO_LOCK 31
191 /* if there has been an IO error for this page */
192 #define BM_PAGE_IO_ERROR 30
193 /* this is to be able to intelligently skip disk IO,
194  * set if bits have been set since last IO. */
195 #define BM_PAGE_NEED_WRITEOUT 29
196 /* to mark for lazy writeout once syncer cleared all clearable bits,
197  * set if bits have been cleared since last IO. */
198 #define BM_PAGE_LAZY_WRITEOUT 28
199 
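/* A small sketch (illustrative only) of how the page index and the state
 * bits above share the single page->private word: the low 24 bits hold
 * the page index, bits 28..31 hold the state flags. */
#if 0
static void bm_page_private_example(struct page *page)
{
	unsigned long idx = 0x123456;	/* some page index < (1UL << 24) */

	set_page_private(page, idx);	/* non-atomic: only at alloc time */
	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));

	/* the flag bits do not disturb the index bits, and vice versa: */
	BUG_ON((page_private(page) & BM_PAGE_IDX_MASK) != idx);
	BUG_ON(!test_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)));
}
#endif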
200 /* store_page_idx uses non-atomic assignment. It is only used directly after
201  * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
202  * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
203  * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
204  * requires it all to be atomic as well. */
205 static void bm_store_page_idx(struct page *page, unsigned long idx)
206 {
207  BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
208  set_page_private(page, idx);
209 }
210 
211 static unsigned long bm_page_to_idx(struct page *page)
212 {
213  return page_private(page) & BM_PAGE_IDX_MASK;
214 }
215 
216 /* As it is very unlikely that the same page is under IO from more than one
217  * context, we can get away with a bit per page and one wait queue per bitmap.
218  */
219 static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
220 {
221  struct drbd_bitmap *b = mdev->bitmap;
222  void *addr = &page_private(b->bm_pages[page_nr]);
223  wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
224 }
225 
226 static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
227 {
228  struct drbd_bitmap *b = mdev->bitmap;
229  void *addr = &page_private(b->bm_pages[page_nr]);
230  clear_bit(BM_PAGE_IO_LOCK, addr);
231  smp_mb__after_clear_bit();
232  wake_up(&mdev->bitmap->bm_io_wait);
233 }
234 
235 /* set _before_ submit_io, so it may be reset due to being changed
236  * while this page is in flight... will get submitted later again */
237 static void bm_set_page_unchanged(struct page *page)
238 {
239  /* use cmpxchg? */
240  clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
241  clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
242 }
243 
244 static void bm_set_page_need_writeout(struct page *page)
245 {
246  set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
247 }
248 
249 static int bm_test_page_unchanged(struct page *page)
250 {
251  volatile const unsigned long *addr = &page_private(page);
252  return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
253 }
254 
255 static void bm_set_page_io_err(struct page *page)
256 {
257  set_bit(BM_PAGE_IO_ERROR, &page_private(page));
258 }
259 
260 static void bm_clear_page_io_err(struct page *page)
261 {
262  clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
263 }
264 
265 static void bm_set_page_lazy_writeout(struct page *page)
266 {
267  set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
268 }
269 
270 static int bm_test_page_lazy_writeout(struct page *page)
271 {
272  return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
273 }
274 
275 /* on a 32bit box, this would allow for exactly (2<<38) bits. */
276 static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
277 {
278  /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
279  unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
280  BUG_ON(page_nr >= b->bm_number_of_pages);
281  return page_nr;
282 }
283 
284 static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
285 {
286  /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
287  unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
288  BUG_ON(page_nr >= b->bm_number_of_pages);
289  return page_nr;
290 }
291 
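/* The shifts above, spelled out (illustrative; assumes 4 KiB pages and
 * 64bit longs, i.e. PAGE_SHIFT == 12, LN2_BPL == 6):
 *   bm_word_to_page_idx: long_nr >> (12 - 6 + 3) == long_nr >> 9,
 *	i.e. 512 long words per page;
 *   bm_bit_to_page_idx: bitnr >> 15, i.e. 32768 bits per page.
 * Example: bit 40000 lives in page 40000 >> 15 == 1, at long word
 * (40000 >> 6) & 511 == 113 within that page. */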
292 static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
293 {
294  struct page *page = b->bm_pages[idx];
295  return (unsigned long *) kmap_atomic(page);
296 }
297 
298 static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
299 {
300  return __bm_map_pidx(b, idx);
301 }
302 
303 static void __bm_unmap(unsigned long *p_addr)
304 {
305  kunmap_atomic(p_addr);
306 }
307 
308 static void bm_unmap(unsigned long *p_addr)
309 {
310  return __bm_unmap(p_addr);
311 }
312 
313 /* long word offset of _bitmap_ sector */
314 #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
315 /* word offset from start of bitmap to word number _in_page_
316  * modulo longs per page
317 #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
318  hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
319  so do it explicitly:
320  */
321 #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
322 
323 /* Long words per page */
324 #define LWPP (PAGE_SIZE/sizeof(long))
325 
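/* Quick sanity checks for the macros above (illustrative only; assumes
 * 4 KiB pages, 64bit longs, and BM_EXT_SHIFT == 24 from drbd_int.h): */
#if 0
static void bm_macro_example(void)
{
	BUILD_BUG_ON(LWPP != 512);	/* 4096 bytes / 8 bytes per long */
	BUILD_BUG_ON(MLPP(513) != 1);	/* word 513 is word 1 of page 1 */
	BUILD_BUG_ON(S2W(1) != 64);	/* one 512 byte bitmap sector holds
					 * 4096 bits, i.e. 64 long words */
}
#endif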
326 /*
327  * actually most functions herein should take a struct drbd_bitmap*, not a
328  * struct drbd_conf*, but for the debug macros I like to have the mdev around
329  * to be able to report device-specific messages.
330  */
331 
332 
333 static void bm_free_pages(struct page **pages, unsigned long number)
334 {
335  unsigned long i;
336  if (!pages)
337  return;
338 
339  for (i = 0; i < number; i++) {
340  if (!pages[i]) {
341  printk(KERN_ALERT "drbd: bm_free_pages tried to free "
342  "a NULL pointer; i=%lu n=%lu\n",
343  i, number);
344  continue;
345  }
346  __free_page(pages[i]);
347  pages[i] = NULL;
348  }
349 }
350 
351 static void bm_vk_free(void *ptr, int v)
352 {
353  if (v)
354  vfree(ptr);
355  else
356  kfree(ptr);
357 }
358 
359 /*
360  * "have" and "want" are NUMBER OF PAGES.
361  */
362 static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
363 {
364  struct page **old_pages = b->bm_pages;
365  struct page **new_pages, *page;
366  unsigned int i, bytes, vmalloced = 0;
367  unsigned long have = b->bm_number_of_pages;
368 
369  BUG_ON(have == 0 && old_pages != NULL);
370  BUG_ON(have != 0 && old_pages == NULL);
371 
372  if (have == want)
373  return old_pages;
374 
375  /* Trying kmalloc first, falling back to vmalloc.
376  * GFP_KERNEL is ok, as this is done when a lower level disk is
377  * "attached" to the drbd. Context is receiver thread or cqueue
378  * thread. As we have no disk yet, we are not in the IO path,
379  * not even the IO path of the peer. */
380  bytes = sizeof(struct page *)*want;
381  new_pages = kzalloc(bytes, GFP_KERNEL);
382  if (!new_pages) {
383  new_pages = vzalloc(bytes);
384  if (!new_pages)
385  return NULL;
386  vmalloced = 1;
387  }
388 
389  if (want >= have) {
390  for (i = 0; i < have; i++)
391  new_pages[i] = old_pages[i];
392  for (; i < want; i++) {
393  page = alloc_page(GFP_HIGHUSER);
394  if (!page) {
395  bm_free_pages(new_pages + have, i - have);
396  bm_vk_free(new_pages, vmalloced);
397  return NULL;
398  }
399  /* we want to know which page it is
400  * from the endio handlers */
401  bm_store_page_idx(page, i);
402  new_pages[i] = page;
403  }
404  } else {
405  for (i = 0; i < want; i++)
406  new_pages[i] = old_pages[i];
407  /* NOT HERE, we are outside the spinlock!
408  bm_free_pages(old_pages + want, have - want);
409  */
410  }
411 
412  if (vmalloced)
413  b->bm_flags |= BM_P_VMALLOCED;
414  else
415  b->bm_flags &= ~BM_P_VMALLOCED;
416 
417  return new_pages;
418 }
419 
420 /*
421  * called on driver init only. TODO call when a device is created.
422  * allocates the drbd_bitmap, and stores it in mdev->bitmap.
423  */
424 int drbd_bm_init(struct drbd_conf *mdev)
425 {
426  struct drbd_bitmap *b = mdev->bitmap;
427  WARN_ON(b != NULL);
428  b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
429  if (!b)
430  return -ENOMEM;
431  spin_lock_init(&b->bm_lock);
432  mutex_init(&b->bm_change);
433  init_waitqueue_head(&b->bm_io_wait);
434 
435  mdev->bitmap = b;
436 
437  return 0;
438 }
439 
440 sector_t drbd_bm_capacity(struct drbd_conf *mdev)
441 {
442  ERR_IF(!mdev->bitmap) return 0;
443  return mdev->bitmap->bm_dev_capacity;
444 }
445 
446 /* called on driver unload. TODO: call when a device is destroyed.
447  */
448 void drbd_bm_cleanup(struct drbd_conf *mdev)
449 {
450  ERR_IF (!mdev->bitmap) return;
451  bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
452  bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
453  kfree(mdev->bitmap);
454  mdev->bitmap = NULL;
455 }
456 
457 /*
458  * since (b->bm_bits % BITS_PER_LONG) != 0,
459  * this masks out the remaining bits.
460  * Returns the number of bits cleared.
461  */
462 #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
463 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
464 #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
465 static int bm_clear_surplus(struct drbd_bitmap *b)
466 {
467  unsigned long mask;
468  unsigned long *p_addr, *bm;
469  int tmp;
470  int cleared = 0;
471 
472  /* number of bits modulo bits per page */
473  tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
474  /* mask the used bits of the word containing the last bit */
475  mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
476  /* bitmap is always stored little endian,
477  * on disk and in core memory alike */
478  mask = cpu_to_lel(mask);
479 
480  p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
481  bm = p_addr + (tmp/BITS_PER_LONG);
482  if (mask) {
483  /* If mask != 0, we are not exactly aligned, so bm now points
484  * to the long containing the last bit.
485  * If mask == 0, bm already points to the word immediately
486  * after the last (long word aligned) bit. */
487  cleared = hweight_long(*bm & ~mask);
488  *bm &= mask;
489  bm++;
490  }
491 
492  if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
493  /* on a 32bit arch, we may need to zero out
494  * a padding long to align with a 64bit remote */
495  cleared += hweight_long(*bm);
496  *bm = 0;
497  }
498  bm_unmap(p_addr);
499  return cleared;
500 }
501 
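/* Worked example for the masking above (illustrative): on a 64bit arch
 * with bm_bits == 70, tmp == 70 and tmp & BITS_PER_LONG_MASK == 6, so
 * mask == (1UL << 6) - 1 == 0x3f: the second long word of the last page
 * keeps its 6 valid low bits while the 58 surplus bits are counted and
 * cleared.  If bm_bits were a multiple of 64, mask would compute to 0,
 * bm would already point past the last valid word, and only the 32bit
 * padding-long case would apply. */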
502 static void bm_set_surplus(struct drbd_bitmap *b)
503 {
504  unsigned long mask;
505  unsigned long *p_addr, *bm;
506  int tmp;
507 
508  /* number of bits modulo bits per page */
509  tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
510  /* mask the used bits of the word containing the last bit */
511  mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
512  /* bitmap is always stored little endian,
513  * on disk and in core memory alike */
514  mask = cpu_to_lel(mask);
515 
516  p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
517  bm = p_addr + (tmp/BITS_PER_LONG);
518  if (mask) {
519  /* If mask != 0, we are not exactly aligned, so bm now points
520  * to the long containing the last bit.
521  * If mask == 0, bm already points to the word immediately
522  * after the last (long word aligned) bit. */
523  *bm |= ~mask;
524  bm++;
525  }
526 
527  if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
528  /* on a 32bit arch, we may need to zero out
529  * a padding long to align with a 64bit remote */
530  *bm = ~0UL;
531  }
532  bm_unmap(p_addr);
533 }
534 
535 /* you better not modify the bitmap while this is running,
536  * or its results will be stale */
537 static unsigned long bm_count_bits(struct drbd_bitmap *b)
538 {
539  unsigned long *p_addr;
540  unsigned long bits = 0;
541  unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
542  int idx, i, last_word;
543 
544  /* all but last page */
545  for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
546  p_addr = __bm_map_pidx(b, idx);
547  for (i = 0; i < LWPP; i++)
548  bits += hweight_long(p_addr[i]);
549  __bm_unmap(p_addr);
550  cond_resched();
551  }
552  /* last (or only) page */
553  last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
554  p_addr = __bm_map_pidx(b, idx);
555  for (i = 0; i < last_word; i++)
556  bits += hweight_long(p_addr[i]);
557  p_addr[last_word] &= cpu_to_lel(mask);
558  bits += hweight_long(p_addr[last_word]);
559  /* 32bit arch, may have an unused padding long */
560  if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
561  p_addr[last_word+1] = 0;
562  __bm_unmap(p_addr);
563  return bits;
564 }
565 
566 /* offset and len in long words.*/
567 static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
568 {
569  unsigned long *p_addr, *bm;
570  unsigned int idx;
571  size_t do_now, end;
572 
573  end = offset + len;
574 
575  if (end > b->bm_words) {
576  printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
577  return;
578  }
579 
580  while (offset < end) {
581  do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
582  idx = bm_word_to_page_idx(b, offset);
583  p_addr = bm_map_pidx(b, idx);
584  bm = p_addr + MLPP(offset);
585  if (bm+do_now > p_addr + LWPP) {
586  printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
587  p_addr, bm, (int)do_now);
588  } else
589  memset(bm, c, do_now * sizeof(long));
590  bm_unmap(p_addr);
591  bm_set_page_need_writeout(b->bm_pages[idx]);
592  offset += do_now;
593  }
594 }
595 
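/* Walk-through of the chunking in bm_memset above (illustrative; assumes
 * LWPP == 512): bm_memset(b, 510, 0, 5) gives end == 515 and two rounds:
 *   do_now = min(ALIGN(511, 512), 515) - 510 == 2  (words 510..511, page 0)
 *   do_now = 515 - 512 == 3                        (words 512..514, page 1)
 * ALIGN(offset + 1, LWPP) rounds up to the next page boundary, so each
 * memset stays within the single page that is currently mapped. */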
596 /*
597  * make sure the bitmap has enough room for the attached storage,
598  * if necessary, resize.
599  * called whenever we may have changed the device size.
600  * returns -ENOMEM if we could not allocate enough memory, 0 on success.
601  * In case this is actually a resize, we copy the old bitmap into the new one.
602  * Otherwise, the bitmap is initialized to all bits set.
603  */
604 int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
605 {
606  struct drbd_bitmap *b = mdev->bitmap;
607  unsigned long bits, words, owords, obits;
608  unsigned long want, have, onpages; /* number of pages */
609  struct page **npages, **opages = NULL;
610  int err = 0, growing;
611  int opages_vmalloced;
612 
613  ERR_IF(!b) return -ENOMEM;
614 
615  drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
616 
617  dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
618  (unsigned long long)capacity);
619 
620  if (capacity == b->bm_dev_capacity)
621  goto out;
622 
623  opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
624 
625  if (capacity == 0) {
626  spin_lock_irq(&b->bm_lock);
627  opages = b->bm_pages;
628  onpages = b->bm_number_of_pages;
629  owords = b->bm_words;
630  b->bm_pages = NULL;
631  b->bm_number_of_pages =
632  b->bm_set =
633  b->bm_bits =
634  b->bm_words =
635  b->bm_dev_capacity = 0;
636  spin_unlock_irq(&b->bm_lock);
637  bm_free_pages(opages, onpages);
638  bm_vk_free(opages, opages_vmalloced);
639  goto out;
640  }
641  bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
642 
643  /* if we would use
644  words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
645  a 32bit host could present the wrong number of words
646  to a 64bit host.
647  */
648  words = ALIGN(bits, 64) >> LN2_BPL;
649 
650  if (get_ldev(mdev)) {
651  u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
652  put_ldev(mdev);
653  if (bits > bits_on_disk) {
654  dev_info(DEV, "bits = %lu\n", bits);
655  dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
656  err = -ENOSPC;
657  goto out;
658  }
659  }
660 
661  want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
662  have = b->bm_number_of_pages;
663  if (want == have) {
664  D_ASSERT(b->bm_pages != NULL);
665  npages = b->bm_pages;
666  } else {
667  if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
668  npages = NULL;
669  else
670  npages = bm_realloc_pages(b, want);
671  }
672 
673  if (!npages) {
674  err = -ENOMEM;
675  goto out;
676  }
677 
678  spin_lock_irq(&b->bm_lock);
679  opages = b->bm_pages;
680  owords = b->bm_words;
681  obits = b->bm_bits;
682 
683  growing = bits > obits;
684  if (opages && growing && set_new_bits)
685  bm_set_surplus(b);
686 
687  b->bm_pages = npages;
688  b->bm_number_of_pages = want;
689  b->bm_bits = bits;
690  b->bm_words = words;
691  b->bm_dev_capacity = capacity;
692 
693  if (growing) {
694  if (set_new_bits) {
695  bm_memset(b, owords, 0xff, words-owords);
696  b->bm_set += bits - obits;
697  } else
698  bm_memset(b, owords, 0x00, words-owords);
699 
700  }
701 
702  if (want < have) {
703  /* implicit: (opages != NULL) && (opages != npages) */
704  bm_free_pages(opages + want, have - want);
705  }
706 
707  (void)bm_clear_surplus(b);
708 
709  spin_unlock_irq(&b->bm_lock);
710  if (opages != npages)
711  bm_vk_free(opages, opages_vmalloced);
712  if (!growing)
713  b->bm_set = bm_count_bits(b);
714  dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
715 
716  out:
717  drbd_bm_unlock(mdev);
718  return err;
719 }
720 
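/* Worked example of the sizing in drbd_bm_resize above (illustrative;
 * assumes 4 KiB pages and 64bit longs): capacity == 2097152 sectors,
 * i.e. a 1 GiB device:
 *   bits  = BM_SECT_TO_BIT(2097152) == 2097152 >> 3 == 262144
 *	     (one bit per 4 KiB, i.e. per eight 512 byte sectors)
 *   words = ALIGN(262144, 64) >> LN2_BPL == 4096
 *   want  = ALIGN(4096 * sizeof(long), PAGE_SIZE) >> PAGE_SHIFT == 8 pages
 * Aligning bits to 64 rather than to BITS_PER_LONG makes a 32bit and a
 * 64bit host agree on the word count, as the comment above explains. */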
721 /* inherently racy:
722  * if not protected by other means, return value may be out of date when
723  * leaving this function...
724  * we still need to lock it, since it is important that this returns
725  * bm_set == 0 precisely.
726  *
727  * maybe bm_set should be atomic_t ?
728  */
729 unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
730 {
731  struct drbd_bitmap *b = mdev->bitmap;
732  unsigned long s;
733  unsigned long flags;
734 
735  ERR_IF(!b) return 0;
736  ERR_IF(!b->bm_pages) return 0;
737 
738  spin_lock_irqsave(&b->bm_lock, flags);
739  s = b->bm_set;
740  spin_unlock_irqrestore(&b->bm_lock, flags);
741 
742  return s;
743 }
744 
745 unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
746 {
747  unsigned long s;
748  /* if I don't have a disk, I don't know about out-of-sync status */
749  if (!get_ldev_if_state(mdev, D_NEGOTIATING))
750  return 0;
751  s = _drbd_bm_total_weight(mdev);
752  put_ldev(mdev);
753  return s;
754 }
755 
756 size_t drbd_bm_words(struct drbd_conf *mdev)
757 {
758  struct drbd_bitmap *b = mdev->bitmap;
759  ERR_IF(!b) return 0;
760  ERR_IF(!b->bm_pages) return 0;
761 
762  return b->bm_words;
763 }
764 
765 unsigned long drbd_bm_bits(struct drbd_conf *mdev)
766 {
767  struct drbd_bitmap *b = mdev->bitmap;
768  ERR_IF(!b) return 0;
769 
770  return b->bm_bits;
771 }
772 
773 /* merge number words from buffer into the bitmap starting at offset.
774  * buffer[i] is expected to be little endian unsigned long.
775  * bitmap must be locked by drbd_bm_lock.
776  * currently only used from receive_bitmap.
777  */
778 void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
779  unsigned long *buffer)
780 {
781  struct drbd_bitmap *b = mdev->bitmap;
782  unsigned long *p_addr, *bm;
783  unsigned long word, bits;
784  unsigned int idx;
785  size_t end, do_now;
786 
787  end = offset + number;
788 
789  ERR_IF(!b) return;
790  ERR_IF(!b->bm_pages) return;
791  if (number == 0)
792  return;
793  WARN_ON(offset >= b->bm_words);
794  WARN_ON(end > b->bm_words);
795 
796  spin_lock_irq(&b->bm_lock);
797  while (offset < end) {
798  do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
799  idx = bm_word_to_page_idx(b, offset);
800  p_addr = bm_map_pidx(b, idx);
801  bm = p_addr + MLPP(offset);
802  offset += do_now;
803  while (do_now--) {
804  bits = hweight_long(*bm);
805  word = *bm | *buffer++;
806  *bm++ = word;
807  b->bm_set += hweight_long(word) - bits;
808  }
809  bm_unmap(p_addr);
810  bm_set_page_need_writeout(b->bm_pages[idx]);
811  }
812  /* with 32bit <-> 64bit cross-platform connect
813  * this is only correct for current usage,
814  * where we _know_ that we are 64 bit aligned,
815  * and know that this function is used in this way, too...
816  */
817  if (end == b->bm_words)
818  b->bm_set -= bm_clear_surplus(b);
819  spin_unlock_irq(&b->bm_lock);
820 }
821 
822 /* copy number words from the bitmap starting at offset into the buffer.
823  * buffer[i] will be little endian unsigned long.
824  */
825 void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
826  unsigned long *buffer)
827 {
828  struct drbd_bitmap *b = mdev->bitmap;
829  unsigned long *p_addr, *bm;
830  size_t end, do_now;
831 
832  end = offset + number;
833 
834  ERR_IF(!b) return;
835  ERR_IF(!b->bm_pages) return;
836 
837  spin_lock_irq(&b->bm_lock);
838  if ((offset >= b->bm_words) ||
839  (end > b->bm_words) ||
840  (number <= 0))
841  dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
842  (unsigned long) offset,
843  (unsigned long) number,
844  (unsigned long) b->bm_words);
845  else {
846  while (offset < end) {
847  do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
848  p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
849  bm = p_addr + MLPP(offset);
850  offset += do_now;
851  while (do_now--)
852  *buffer++ = *bm++;
853  bm_unmap(p_addr);
854  }
855  }
856  spin_unlock_irq(&b->bm_lock);
857 }
858 
859 /* set all bits in the bitmap */
860 void drbd_bm_set_all(struct drbd_conf *mdev)
861 {
862  struct drbd_bitmap *b = mdev->bitmap;
863  ERR_IF(!b) return;
864  ERR_IF(!b->bm_pages) return;
865 
866  spin_lock_irq(&b->bm_lock);
867  bm_memset(b, 0, 0xff, b->bm_words);
868  (void)bm_clear_surplus(b);
869  b->bm_set = b->bm_bits;
870  spin_unlock_irq(&b->bm_lock);
871 }
872 
873 /* clear all bits in the bitmap */
874 void drbd_bm_clear_all(struct drbd_conf *mdev)
875 {
876  struct drbd_bitmap *b = mdev->bitmap;
877  ERR_IF(!b) return;
878  ERR_IF(!b->bm_pages) return;
879 
880  spin_lock_irq(&b->bm_lock);
881  bm_memset(b, 0, 0, b->bm_words);
882  b->bm_set = 0;
883  spin_unlock_irq(&b->bm_lock);
884 }
885 
886 struct bm_aio_ctx {
887  struct drbd_conf *mdev;
888  atomic_t in_flight;
889  unsigned int done;
890  unsigned flags;
891 #define BM_AIO_COPY_PAGES 1
892 #define BM_WRITE_ALL_PAGES 2
893  int error;
894  struct kref kref;
895 };
896 
897 static void bm_aio_ctx_destroy(struct kref *kref)
898 {
899  struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
900 
901  put_ldev(ctx->mdev);
902  kfree(ctx);
903 }
904 
905 /* bv_page may be a copy, or may be the original */
906 static void bm_async_io_complete(struct bio *bio, int error)
907 {
908  struct bm_aio_ctx *ctx = bio->bi_private;
909  struct drbd_conf *mdev = ctx->mdev;
910  struct drbd_bitmap *b = mdev->bitmap;
911  unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
912  int uptodate = bio_flagged(bio, BIO_UPTODATE);
913 
914 
915  /* strange behavior of some lower level drivers...
916  * fail the request by clearing the uptodate flag,
917  * but do not return any error?!
918  * do we want to WARN() on this? */
919  if (!error && !uptodate)
920  error = -EIO;
921 
922  if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
923  !bm_test_page_unchanged(b->bm_pages[idx]))
924  dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
925 
926  if (error) {
927  /* ctx error will hold the completed-last non-zero error code,
928  * in case error codes differ. */
929  ctx->error = error;
930  bm_set_page_io_err(b->bm_pages[idx]);
931  /* Not identical to on disk version of it.
932  * Is BM_PAGE_IO_ERROR enough? */
933  if (__ratelimit(&drbd_ratelimit_state))
934  dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
935  error, idx);
936  } else {
937  bm_clear_page_io_err(b->bm_pages[idx]);
938  dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
939  }
940 
941  bm_page_unlock_io(mdev, idx);
942 
943  if (ctx->flags & BM_AIO_COPY_PAGES)
944  mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
945 
946  bio_put(bio);
947 
948  if (atomic_dec_and_test(&ctx->in_flight)) {
949  ctx->done = 1;
950  wake_up(&mdev->misc_wait);
951  kref_put(&ctx->kref, &bm_aio_ctx_destroy);
952  }
953 }
954 
955 static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
956 {
957  struct bio *bio = bio_alloc_drbd(GFP_NOIO);
958  struct drbd_conf *mdev = ctx->mdev;
959  struct drbd_bitmap *b = mdev->bitmap;
960  struct page *page;
961  unsigned int len;
962 
963  sector_t on_disk_sector =
964  mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
965  on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
966 
967  /* this might happen with a very small
968  * flexible external meta data device,
969  * or with PAGE_SIZE > 4k */
970  len = min_t(unsigned int, PAGE_SIZE,
971  (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
972 
973  /* serialize IO on this page */
974  bm_page_lock_io(mdev, page_nr);
975  /* before memcpy and submit,
976  * so it can be redirtied any time */
977  bm_set_page_unchanged(b->bm_pages[page_nr]);
978 
979  if (ctx->flags & BM_AIO_COPY_PAGES) {
980  void *src, *dest;
981  page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
982  dest = kmap_atomic(page);
983  src = kmap_atomic(b->bm_pages[page_nr]);
984  memcpy(dest, src, PAGE_SIZE);
985  kunmap_atomic(src);
986  kunmap_atomic(dest);
987  bm_store_page_idx(page, page_nr);
988  } else
989  page = b->bm_pages[page_nr];
990 
991  bio->bi_bdev = mdev->ldev->md_bdev;
992  bio->bi_sector = on_disk_sector;
993  /* bio_add_page of a single page to an empty bio will always succeed,
994  * according to the API. Do we want to assert that? */
995  bio_add_page(bio, page, len, 0);
996  bio->bi_private = ctx;
997  bio->bi_end_io = bm_async_io_complete;
998 
999  if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
1000  bio->bi_rw |= rw;
1001  bio_endio(bio, -EIO);
1002  } else {
1003  submit_bio(rw, bio);
1004  /* this should not count as user activity and cause the
1005  * resync to throttle -- see drbd_rs_should_slow_down(). */
1006  atomic_add(len >> 9, &mdev->rs_sect_ev);
1007  }
1008 }
1009 
1010 /*
1011  * bm_rw: read/write the whole bitmap from/to its on disk location.
1012  */
1013 static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
1014 {
1015  struct bm_aio_ctx *ctx;
1016  struct drbd_bitmap *b = mdev->bitmap;
1017  int num_pages, i, count = 0;
1018  unsigned long now;
1019  char ppb[10];
1020  int err = 0;
1021 
1022  /*
1023  * We are protected against bitmap disappearing/resizing by holding an
1024  * ldev reference (caller must have called get_ldev()).
1025  * For read/write, we are protected against changes to the bitmap by
1026  * the bitmap lock (see drbd_bitmap_io).
1027  * For lazy writeout, we don't care for ongoing changes to the bitmap,
1028  * as we submit copies of pages anyways.
1029  */
1030 
1031  ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
1032  if (!ctx)
1033  return -ENOMEM;
1034 
1035  *ctx = (struct bm_aio_ctx) {
1036  .mdev = mdev,
1037  .in_flight = ATOMIC_INIT(1),
1038  .done = 0,
1039  .flags = flags,
1040  .error = 0,
1041  .kref = { ATOMIC_INIT(2) },
1042  };
1043 
1044  if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
1045  dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
1046  kfree(ctx);
1047  return -ENODEV;
1048  }
1049 
1050  if (!ctx->flags)
1051  WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
1052 
1053  num_pages = b->bm_number_of_pages;
1054 
1055  now = jiffies;
1056 
1057  /* let the layers below us try to merge these bios... */
1058  for (i = 0; i < num_pages; i++) {
1059  /* ignore completely unchanged pages */
1060  if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1061  break;
1062  if (rw & WRITE) {
1063  if (!(flags & BM_WRITE_ALL_PAGES) &&
1064  bm_test_page_unchanged(b->bm_pages[i])) {
1065  dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
1066  continue;
1067  }
1068  /* during lazy writeout,
1069  * ignore those pages not marked for lazy writeout. */
1070  if (lazy_writeout_upper_idx &&
1071  !bm_test_page_lazy_writeout(b->bm_pages[i])) {
1072  dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
1073  continue;
1074  }
1075  }
1076  atomic_inc(&ctx->in_flight);
1077  bm_page_io_async(ctx, i, rw);
1078  ++count;
1079  cond_resched();
1080  }
1081 
1082  /*
1083  * We initialize ctx->in_flight to one to make sure bm_async_io_complete
1084  * will not set ctx->done early, and decrement / test it here. If there
1085  * are still some bios in flight, we need to wait for them here.
1086  * If all IO is done already (or nothing had been submitted), there is
1087  * no need to wait. Still, we need to put the kref associated with the
1088  * "in_flight reached zero, all done" event.
1089  */
1090  if (!atomic_dec_and_test(&ctx->in_flight))
1091  wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
1092  else
1093  kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1094 
1095  dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
1096  rw == WRITE ? "WRITE" : "READ",
1097  count, jiffies - now);
1098 
1099  if (ctx->error) {
1100  dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
1101  drbd_chk_io_error(mdev, 1, true);
1102  err = -EIO; /* ctx->error ? */
1103  }
1104 
1105  if (atomic_read(&ctx->in_flight))
1106  err = -EIO; /* Disk failed during IO... */
1107 
1108  now = jiffies;
1109  if (rw == WRITE) {
1110  drbd_md_flush(mdev);
1111  } else /* rw == READ */ {
1112  b->bm_set = bm_count_bits(b);
1113  dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
1114  jiffies - now);
1115  }
1116  now = b->bm_set;
1117 
1118  dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
1119  ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
1120 
1121  kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1122  return err;
1123 }
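
/* The submission/completion handshake used by bm_rw above, reduced to its
 * core (an illustrative sketch; submit_one() and wait_all_done() are
 * hypothetical stand-ins, not drbd functions):
 *
 *	atomic_set(&ctx->in_flight, 1);		<-- the submitter's reference
 *	for (i = 0; i < num_pages; i++) {
 *		atomic_inc(&ctx->in_flight);
 *		submit_one(ctx, i);		<-- completion does dec_and_test
 *	}
 *	if (!atomic_dec_and_test(&ctx->in_flight))
 *		wait_all_done(ctx);		<-- some IO still in flight
 *
 * Starting at 1 instead of 0 guarantees that no completion can see
 * in_flight drop to 0, and thus declare "done", while submission is
 * still in progress. */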
1124 
1125 /**
1126  * drbd_bm_read() - Read the whole bitmap from its on disk location.
1127  * @mdev: DRBD device.
1128  */
1129 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
1130 {
1131  return bm_rw(mdev, READ, 0, 0);
1132 }
1133 
1140 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
1141 {
1142  return bm_rw(mdev, WRITE, 0, 0);
1143 }
1144 
1145 /**
1146  * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
1147  * @mdev: DRBD device.
1148  *
1149  * Will write all pages.
1150  */
1151 int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
1152 {
1153  return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
1154 }
1155 
1156 /**
1157  * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
1158  * @mdev: DRBD device.
1159  * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
1160  */
1161 int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
1162 {
1163  return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
1164 }
1165 
1166 /**
1167  * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
1168  * @mdev: DRBD device.
1169  *
1170  * Will only write pages that have changed since last IO.
1171  * In contrast to drbd_bm_write(), this will copy the bitmap pages
1172  * to temporary writeout pages. It is intended to trigger a full write-out
1173  * while still allowing the bitmap to change, for example if a resync or online
1174  * verify is aborted due to a failed peer disk, while local IO continues, or
1175  * pending resync acks are still being processed.
1176  */
1177 int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
1178 {
1179  return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
1180 }
1181 
1182 
1184 /**
1185  * drbd_bm_write_page() - Writes a page from the bitmap to its on disk location.
1186  * @mdev: DRBD device.
1187  * @idx: bitmap page index
1188  *
1189  * We don't want to special case on logical_block_size of the backend device,
1190  * so we submit PAGE_SIZE aligned pieces.
1191  * Note that on "most" systems, PAGE_SIZE is 4096.
1192  * In case this becomes an issue on systems with larger PAGE_SIZE,
1193  * we may want to change this again to write 4k aligned 4k pieces.
1194  */
1195 int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
1196 {
1197  struct bm_aio_ctx *ctx;
1198  int err;
1199 
1200  if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
1201  dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
1202  return 0;
1203  }
1204 
1205  ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
1206  if (!ctx)
1207  return -ENOMEM;
1208 
1209  *ctx = (struct bm_aio_ctx) {
1210  .mdev = mdev,
1211  .in_flight = ATOMIC_INIT(1),
1212  .done = 0,
1213  .flags = BM_AIO_COPY_PAGES,
1214  .error = 0,
1215  .kref = { ATOMIC_INIT(2) },
1216  };
1217 
1218  if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
1219  dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
1220  kfree(ctx);
1221  return -ENODEV;
1222  }
1223 
1224  bm_page_io_async(ctx, idx, WRITE_SYNC);
1225  wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
1226 
1227  if (ctx->error)
1228  drbd_chk_io_error(mdev, 1, true);
1229  /* that should force detach, so the in memory bitmap will be
1230  * gone in a moment as well. */
1231 
1232  mdev->bm_writ_cnt++;
1233  err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
1234  kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1235  return err;
1236 }
1237 
1238 /* NOTE
1239  * find_first_bit returns int, we return unsigned long.
1240  * For this to work on 32bit arch with bitnumbers > (1<<32),
1241  * we'd need to return u64, and get a whole lot of other places
1242  * fixed where we still use unsigned long.
1243  *
1244  * this returns a bit number, NOT a sector!
1245  */
1246 static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
1247  const int find_zero_bit)
1248 {
1249  struct drbd_bitmap *b = mdev->bitmap;
1250  unsigned long *p_addr;
1251  unsigned long bit_offset;
1252  unsigned i;
1253 
1254 
1255  if (bm_fo > b->bm_bits) {
1256  dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
1257  bm_fo = DRBD_END_OF_BITMAP;
1258  } else {
1259  while (bm_fo < b->bm_bits) {
1260  /* bit offset of the first bit in the page */
1261  bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
1262  p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
1263 
1264  if (find_zero_bit)
1265  i = find_next_zero_bit_le(p_addr,
1266  PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1267  else
1268  i = find_next_bit_le(p_addr,
1269  PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1270 
1271  __bm_unmap(p_addr);
1272  if (i < PAGE_SIZE*8) {
1273  bm_fo = bit_offset + i;
1274  if (bm_fo >= b->bm_bits)
1275  break;
1276  goto found;
1277  }
1278  bm_fo = bit_offset + PAGE_SIZE*8;
1279  }
1280  bm_fo = DRBD_END_OF_BITMAP;
1281  }
1282  found:
1283  return bm_fo;
1284 }
1285 
1286 static unsigned long bm_find_next(struct drbd_conf *mdev,
1287  unsigned long bm_fo, const int find_zero_bit)
1288 {
1289  struct drbd_bitmap *b = mdev->bitmap;
1290  unsigned long i = DRBD_END_OF_BITMAP;
1291 
1292  ERR_IF(!b) return i;
1293  ERR_IF(!b->bm_pages) return i;
1294 
1295  spin_lock_irq(&b->bm_lock);
1296  if (BM_DONT_TEST & b->bm_flags)
1297  bm_print_lock_info(mdev);
1298 
1299  i = __bm_find_next(mdev, bm_fo, find_zero_bit);
1300 
1301  spin_unlock_irq(&b->bm_lock);
1302  return i;
1303 }
1304 
1305 unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1306 {
1307  return bm_find_next(mdev, bm_fo, 0);
1308 }
1309 
1310 #if 0
1311 /* not yet needed for anything. */
1312 unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1313 {
1314  return bm_find_next(mdev, bm_fo, 1);
1315 }
1316 #endif
1317 
1318 /* does not spin_lock_irqsave.
1319  * you must take drbd_bm_lock() first */
1320 unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1321 {
1322  /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1323  return __bm_find_next(mdev, bm_fo, 0);
1324 }
1325 
1326 unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1327 {
1328  /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1329  return __bm_find_next(mdev, bm_fo, 1);
1330 }
1331 
1332 /* returns number of bits actually changed.
1333  * for val != 0, we change 0 -> 1, return code positive
1334  * for val == 0, we change 1 -> 0, return code negative
1335  * wants bitnr, not sector.
1336  * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1337  * Must hold bitmap lock already. */
1338 static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1339  unsigned long e, int val)
1340 {
1341  struct drbd_bitmap *b = mdev->bitmap;
1342  unsigned long *p_addr = NULL;
1343  unsigned long bitnr;
1344  unsigned int last_page_nr = -1U;
1345  int c = 0;
1346  int changed_total = 0;
1347 
1348  if (e >= b->bm_bits) {
1349  dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1350  s, e, b->bm_bits);
1351  e = b->bm_bits ? b->bm_bits -1 : 0;
1352  }
1353  for (bitnr = s; bitnr <= e; bitnr++) {
1354  unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
1355  if (page_nr != last_page_nr) {
1356  if (p_addr)
1357  __bm_unmap(p_addr);
1358  if (c < 0)
1359  bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1360  else if (c > 0)
1361  bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1362  changed_total += c;
1363  c = 0;
1364  p_addr = __bm_map_pidx(b, page_nr);
1365  last_page_nr = page_nr;
1366  }
1367  if (val)
1368  c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1369  else
1370  c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1371  }
1372  if (p_addr)
1373  __bm_unmap(p_addr);
1374  if (c < 0)
1375  bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1376  else if (c > 0)
1377  bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1378  changed_total += c;
1379  b->bm_set += changed_total;
1380  return changed_total;
1381 }
1382 
1383 /* returns number of bits actually changed.
1384  * for val != 0, we change 0 -> 1, return code positive
1385  * for val == 0, we change 1 -> 0, return code negative
1386  * wants bitnr, not sector */
1387 static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1388  const unsigned long e, int val)
1389 {
1390  unsigned long flags;
1391  struct drbd_bitmap *b = mdev->bitmap;
1392  int c = 0;
1393 
1394  ERR_IF(!b) return 1;
1395  ERR_IF(!b->bm_pages) return 0;
1396 
1397  spin_lock_irqsave(&b->bm_lock, flags);
1398  if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
1399  bm_print_lock_info(mdev);
1400 
1401  c = __bm_change_bits_to(mdev, s, e, val);
1402 
1403  spin_unlock_irqrestore(&b->bm_lock, flags);
1404  return c;
1405 }
1406 
1407 /* returns number of bits changed 0 -> 1 */
1408 int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1409 {
1410  return bm_change_bits_to(mdev, s, e, 1);
1411 }
1412 
1413 /* returns number of bits changed 1 -> 0 */
1414 int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1415 {
1416  return -bm_change_bits_to(mdev, s, e, 0);
1417 }
1418 
1419 /* sets all bits in full words,
1420  * from first_word up to, but not including, last_word */
1421 static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1422  int page_nr, int first_word, int last_word)
1423 {
1424  int i;
1425  int bits;
1426  unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
1427  for (i = first_word; i < last_word; i++) {
1428  bits = hweight_long(paddr[i]);
1429  paddr[i] = ~0UL;
1430  b->bm_set += BITS_PER_LONG - bits;
1431  }
1432  kunmap_atomic(paddr);
1433 }
1434 
1435 /* Same thing as drbd_bm_set_bits,
1436  * but more efficient for a large bit range.
1437  * You must first drbd_bm_lock().
1438  * Can be called to set the whole bitmap in one go.
1439  * Sets bits from s to e _inclusive_. */
1440 void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1441 {
1442  /* First set_bit from the first bit (s)
1443  * up to the next long boundary (sl),
1444  * then assign full words up to the last long boundary (el),
1445  * then set_bit up to and including the last bit (e).
1446  *
1447  * Do not use memset, because we must account for changes,
1448  * so we need to loop over the words with hweight() anyways.
1449  */
1450  struct drbd_bitmap *b = mdev->bitmap;
1451  unsigned long sl = ALIGN(s,BITS_PER_LONG);
1452  unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1453  int first_page;
1454  int last_page;
1455  int page_nr;
1456  int first_word;
1457  int last_word;
1458 
1459  if (e - s <= 3*BITS_PER_LONG) {
1460  /* don't bother; el and sl may even be wrong. */
1461  spin_lock_irq(&b->bm_lock);
1462  __bm_change_bits_to(mdev, s, e, 1);
1463  spin_unlock_irq(&b->bm_lock);
1464  return;
1465  }
1466 
1467  /* difference is large enough that we can trust sl and el */
1468 
1469  spin_lock_irq(&b->bm_lock);
1470 
1471  /* bits filling the current long */
1472  if (sl)
1473  __bm_change_bits_to(mdev, s, sl-1, 1);
1474 
1475  first_page = sl >> (3 + PAGE_SHIFT);
1476  last_page = el >> (3 + PAGE_SHIFT);
1477 
1478  /* MLPP: modulo longs per page */
1479  /* LWPP: long words per page */
1480  first_word = MLPP(sl >> LN2_BPL);
1481  last_word = LWPP;
1482 
1483  /* first and full pages, unless first page == last page */
1484  for (page_nr = first_page; page_nr < last_page; page_nr++) {
1485  bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1486  spin_unlock_irq(&b->bm_lock);
1487  cond_resched();
1488  first_word = 0;
1489  spin_lock_irq(&b->bm_lock);
1490  }
1491  /* last page (respectively only page, for first page == last page) */
1492  last_word = MLPP(el >> LN2_BPL);
1493 
1494  /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
1495  * ==> e = 32767, el = 32768, last_page = 2,
1496  * and now last_word = 0.
1497  * We do not want to touch last_page in this case,
1498  * as we did not allocate it; it is not present in bitmap->bm_pages.
1499  */
1500  if (last_word)
1501  bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1502 
1503  /* possibly trailing bits.
1504  * example: (e & 63) == 63, el will be e+1.
1505  * if that even was the very last bit,
1506  * it would trigger an assert in __bm_change_bits_to()
1507  */
1508  if (el <= e)
1509  __bm_change_bits_to(mdev, el, e, 1);
1510  spin_unlock_irq(&b->bm_lock);
1511 }
1512 
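/* Worked example for the three phases above (illustrative; assumes 64bit
 * longs and 4 KiB pages): s == 100, e == 70000:
 *   sl == ALIGN(100, 64) == 128	--> bits 100..127 set bit-by-bit
 *   el == 70001 & ~63UL  == 69952	--> bits 128..69951 assigned as
 *	  full words (first_page == 128 >> 15 == 0,
 *	  last_page == 69952 >> 15 == 2)
 *   el <= e, so bits 69952..70000 are set bit-by-bit as the tail. */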
1513 /* returns bit state
1514  * wants bitnr, NOT sector.
1515  * inherently racy... area needs to be locked by means of {al,rs}_lru
1516  * 1 ... bit set
1517  * 0 ... bit not set
1518  * -1 ... first out of bounds access, stop testing for bits!
1519  */
1520 int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1521 {
1522  unsigned long flags;
1523  struct drbd_bitmap *b = mdev->bitmap;
1524  unsigned long *p_addr;
1525  int i;
1526 
1527  ERR_IF(!b) return 0;
1528  ERR_IF(!b->bm_pages) return 0;
1529 
1530  spin_lock_irqsave(&b->bm_lock, flags);
1531  if (BM_DONT_TEST & b->bm_flags)
1532  bm_print_lock_info(mdev);
1533  if (bitnr < b->bm_bits) {
1534  p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
1535  i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
1536  bm_unmap(p_addr);
1537  } else if (bitnr == b->bm_bits) {
1538  i = -1;
1539  } else { /* (bitnr > b->bm_bits) */
1540  dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1541  i = 0;
1542  }
1543 
1544  spin_unlock_irqrestore(&b->bm_lock, flags);
1545  return i;
1546 }
1547 
1548 /* returns number of bits set in the range [s, e] */
1549 int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1550 {
1551  unsigned long flags;
1552  struct drbd_bitmap *b = mdev->bitmap;
1553  unsigned long *p_addr = NULL;
1554  unsigned long bitnr;
1555  unsigned int page_nr = -1U;
1556  int c = 0;
1557 
1558  /* If this is called without a bitmap, that is a bug. But just to be
1559  * robust in case we screwed up elsewhere, in that case pretend there
1560  * was one dirty bit in the requested area, so we won't try to do a
1561  * local read there (no bitmap probably implies no disk) */
1562  ERR_IF(!b) return 1;
1563  ERR_IF(!b->bm_pages) return 1;
1564 
1565  spin_lock_irqsave(&b->bm_lock, flags);
1566  if (BM_DONT_TEST & b->bm_flags)
1567  bm_print_lock_info(mdev);
1568  for (bitnr = s; bitnr <= e; bitnr++) {
1569  unsigned int idx = bm_bit_to_page_idx(b, bitnr);
1570  if (page_nr != idx) {
1571  page_nr = idx;
1572  if (p_addr)
1573  bm_unmap(p_addr);
1574  p_addr = bm_map_pidx(b, idx);
1575  }
1576  ERR_IF (bitnr >= b->bm_bits) {
1577  dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1578  } else {
1579  c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1580  }
1581  }
1582  if (p_addr)
1583  bm_unmap(p_addr);
1584  spin_unlock_irqrestore(&b->bm_lock, flags);
1585  return c;
1586 }
1587 
1588 
1589 /* inherently racy...
1590  * return value may be already out-of-date when this function returns.
1591  * but the general usage is that this is only used during a cstate when bits are
1592  * only cleared, not set, and we typically only care for the case when the return
1593  * value is zero, or we already "locked" this "bitmap extent" by other means.
1594  *
1595  * enr is bm-extent number, since we chose to name one sector (512 bytes)
1596  * worth of the bitmap a "bitmap extent".
1597  *
1598  * TODO
1599  * I think since we use it like a reference count, we should use the real
1600  * reference count of some bitmap extent element from some lru instead...
1601  *
1602  */
1603 int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1604 {
1605  struct drbd_bitmap *b = mdev->bitmap;
1606  int count, s, e;
1607  unsigned long flags;
1608  unsigned long *p_addr, *bm;
1609 
1610  ERR_IF(!b) return 0;
1611  ERR_IF(!b->bm_pages) return 0;
1612 
1613  spin_lock_irqsave(&b->bm_lock, flags);
1614  if (BM_DONT_TEST & b->bm_flags)
1615  bm_print_lock_info(mdev);
1616 
1617  s = S2W(enr);
1618  e = min((size_t)S2W(enr+1), b->bm_words);
1619  count = 0;
1620  if (s < b->bm_words) {
1621  int n = e-s;
1622  p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1623  bm = p_addr + MLPP(s);
1624  while (n--)
1625  count += hweight_long(*bm++);
1626  bm_unmap(p_addr);
1627  } else {
1628  dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1629  }
1630  spin_unlock_irqrestore(&b->bm_lock, flags);
1631  return count;
1632 }
1633 
1634 /* Set all bits covered by the AL-extent al_enr.
1635  * Returns number of bits changed. */
1636 unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1637 {
1638  struct drbd_bitmap *b = mdev->bitmap;
1639  unsigned long *p_addr, *bm;
1640  unsigned long weight;
1641  unsigned long s, e;
1642  int count, i, do_now;
1643  ERR_IF(!b) return 0;
1644  ERR_IF(!b->bm_pages) return 0;
1645 
1646  spin_lock_irq(&b->bm_lock);
1647  if (BM_DONT_SET & b->bm_flags)
1648  bm_print_lock_info(mdev);
1649  weight = b->bm_set;
1650 
1651  s = al_enr * BM_WORDS_PER_AL_EXT;
1652  e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1653  /* assert that s and e are on the same page */
1654  D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1655  == s >> (PAGE_SHIFT - LN2_BPL + 3));
1656  count = 0;
1657  if (s < b->bm_words) {
1658  i = do_now = e-s;
1659  p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1660  bm = p_addr + MLPP(s);
1661  while (i--) {
1662  count += hweight_long(*bm);
1663  *bm = -1UL;
1664  bm++;
1665  }
1666  bm_unmap(p_addr);
1667  b->bm_set += do_now*BITS_PER_LONG - count;
1668  if (e == b->bm_words)
1669  b->bm_set -= bm_clear_surplus(b);
1670  } else {
1671  dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
1672  }
1673  weight = b->bm_set - weight;
1674  spin_unlock_irq(&b->bm_lock);
1675  return weight;
1676 }