dm-btree.c
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

int lower_bound(struct node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}
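
/*
 * A worked example of the convention above (keys purely illustrative):
 * for a node holding the keys {10, 20, 30},
 *
 *      lower_bound(n, 20) == 1    exact hit, index of key 20
 *      lower_bound(n, 25) == 1    largest key <= 25 is 20
 *      lower_bound(n, 5)  == -1   no key <= 5 exists
 *      bsearch(n, 25, 1)  == 2    want_hi picks the first key > 25
 */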

void inc_children(struct dm_transaction_manager *tm, struct node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
                     uint64_t key, void *value)
        __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n). This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
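
/*
 * Worked example (sizes illustrative, assuming a 32-byte struct node_header):
 * with 4096-byte blocks and 8-byte values, elt_size = 8 + 8 = 16, so
 * (4096 - 32) / 16 = 254 entries would fit; 254 / 3 = 84, giving
 * max_entries = 3 * 84 = 252.
 */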

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};
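
/*
 * Each frame stands in for one level of the recursion being avoided:
 * push_frame() plays the part of a recursive call on a child block and
 * pop_frame() the return from it, dropping the block's reference count
 * on the way back out.
 */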

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference count and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 1);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (f->level != (info->levels - 1)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i));
                        }
                        f->current_child = f->nr_children;
                }
        }

out:
        kfree(s);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}
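
/*
 * dm_btree_lookup() below simply runs btree_lookup_raw() once per level:
 * for every level except the last, the value found is itself the __le64
 * root of the next btree, which is why internal levels are read with a
 * sizeof(uint64_t) value size.  In a hypothetical two-level tree, keys[0]
 * would select the sub-tree and keys[1] the final value.
 */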

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *             |
 *             v
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |    |
 *                v    +------+
 *        +----------+        |
 *        | A* +++   |        v
 *        +----------+   +-------+
 *                       | B +++ |
 *                       +-------+
 *
 * Where A* is a shadow of A.
 */
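
/*
 * For instance (entries illustrative), with max_entries = 6 and A full
 * with keys {1, 2, 3, 4, 5, 6}: nr_left = nr_right = 3, so A* keeps
 * {1, 2, 3}, B receives {4, 5, 6}, and key 4 (rn->keys[0]) is inserted
 * into the parent as the lower bound for B.
 */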
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
                               unsigned parent_index, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r)
                return r;

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *        +------------+
 *        | A (shadow) |
 *        +------------+
 *            |     |
 *     +------+     +----+
 *     |                 |
 *     v                 v
 * +-------+         +-------+
 * | B +++ |         | C +++ |
 * +-------+         +-------+
 */
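
/*
 * Continuing the illustrative example above: if a full root A holds
 * {1, 2, 3, 4, 5, 6}, then B gets {1, 2, 3}, C gets {4, 5, 6}, and A is
 * rewritten as an internal node with the two entries (1 -> B) and (4 -> C).
 */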
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                /* FIXME: put left */
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        /*
         * Rejig the spine.  This is ugly, since it knows too
         * much about the spine.
         */
        if (s->nodes[0] != new_parent) {
                unlock_block(s->info, s->nodes[0]);
                s->nodes[0] = new_parent;
        }
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }
        s->count = 2;

        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, root, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
                  __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct node *n;
        struct dm_btree_value_type le64_type;

        le64_type.context = NULL;
        le64_type.size = sizeof(__le64);
        le64_type.inc = NULL;
        le64_type.dec = NULL;
        le64_type.equal = NULL;

        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
                    __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
                           __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

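/*
 * A minimal usage sketch (not part of this file): building a single-level
 * tree of __le64 values.  The transaction manager "tm" is assumed to have
 * been set up elsewhere, e.g. via dm_tm_create_with_sm().
 *
 *      struct dm_btree_info info = {
 *              .tm = tm,
 *              .levels = 1,
 *              .value_type = {
 *                      .context = NULL,
 *                      .size = sizeof(__le64),
 *                      .inc = NULL,
 *                      .dec = NULL,
 *                      .equal = NULL,
 *              },
 *      };
 *      dm_block_t root, new_root;
 *      uint64_t key = 42;
 *      __le64 value = cpu_to_le64(1234), result;
 *      int r;
 *
 *      r = dm_btree_empty(&info, &root);
 *      if (!r)
 *              r = dm_btree_insert(&info, root, &key, &value, &new_root);
 *      if (!r)
 *              r = dm_btree_lookup(&info, new_root, &key, &result);
 */
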
/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
                            uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                if (next_block || flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_highest_key(&spine, root, result_keys + level,
                                     level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}