Linux Kernel 3.7.1
extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle. All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 
37 #undef SCRAMBLE_DELAYED_REFS
38 
39 /*
40  * control flags for do_chunk_alloc's force field
41  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
42  * if we really need one.
43  *
44  * CHUNK_ALLOC_LIMITED means to only try and allocate one
45  * if we have very few chunks already allocated. This is
46  * used as part of the clustering code to help make sure
47  * we have a good pool of storage to cluster in, without
48  * filling the FS with empty chunks
49  *
50  * CHUNK_ALLOC_FORCE means it must try to allocate one
51  *
52  */
53 enum {
54  CHUNK_ALLOC_NO_FORCE = 0,
55  CHUNK_ALLOC_LIMITED = 1,
56  CHUNK_ALLOC_FORCE = 2,
57 };
58 
59 /*
60  * Control how reservations are dealt with.
61  *
62  * RESERVE_FREE - freeing a reservation.
63  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
64  * ENOSPC accounting
65  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
66  * bytes_may_use as the ENOSPC accounting is done elsewhere
67  */
68 enum {
69  RESERVE_FREE = 0,
70  RESERVE_ALLOC = 1,
71  RESERVE_ALLOC_NO_ACCOUNT = 2,
72 };
73 
74 static int update_block_group(struct btrfs_trans_handle *trans,
75  struct btrfs_root *root,
76  u64 bytenr, u64 num_bytes, int alloc);
77 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
78  struct btrfs_root *root,
79  u64 bytenr, u64 num_bytes, u64 parent,
80  u64 root_objectid, u64 owner_objectid,
81  u64 owner_offset, int refs_to_drop,
82  struct btrfs_delayed_extent_op *extra_op);
83 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
84  struct extent_buffer *leaf,
85  struct btrfs_extent_item *ei);
86 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
87  struct btrfs_root *root,
88  u64 parent, u64 root_objectid,
89  u64 flags, u64 owner, u64 offset,
90  struct btrfs_key *ins, int ref_mod);
91 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
92  struct btrfs_root *root,
93  u64 parent, u64 root_objectid,
94  u64 flags, struct btrfs_disk_key *key,
95  int level, struct btrfs_key *ins);
96 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97  struct btrfs_root *extent_root, u64 flags,
98  int force);
99 static int find_next_key(struct btrfs_path *path, int level,
100  struct btrfs_key *key);
101 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
102  int dump_block_groups);
103 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
104  u64 num_bytes, int reserve);
105 
106 static noinline int
107 block_group_cache_done(struct btrfs_block_group_cache *cache)
108 {
109  smp_mb();
110  return cache->cached == BTRFS_CACHE_FINISHED;
111 }
112 
113 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
114 {
115  return (cache->flags & bits) == bits;
116 }
117 
118 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
119 {
120  atomic_inc(&cache->count);
121 }
122 
123 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
124 {
125  if (atomic_dec_and_test(&cache->count)) {
126  WARN_ON(cache->pinned > 0);
127  WARN_ON(cache->reserved > 0);
128  kfree(cache->free_space_ctl);
129  kfree(cache);
130  }
131 }
132 
133 /*
134  * this adds the block group to the fs_info rb tree for the block group
135  * cache
136  */
137 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
138  struct btrfs_block_group_cache *block_group)
139 {
140  struct rb_node **p;
141  struct rb_node *parent = NULL;
142  struct btrfs_block_group_cache *cache;
143 
144  spin_lock(&info->block_group_cache_lock);
145  p = &info->block_group_cache_tree.rb_node;
146 
147  while (*p) {
148  parent = *p;
149  cache = rb_entry(parent, struct btrfs_block_group_cache,
150  cache_node);
151  if (block_group->key.objectid < cache->key.objectid) {
152  p = &(*p)->rb_left;
153  } else if (block_group->key.objectid > cache->key.objectid) {
154  p = &(*p)->rb_right;
155  } else {
156  spin_unlock(&info->block_group_cache_lock);
157  return -EEXIST;
158  }
159  }
160 
161  rb_link_node(&block_group->cache_node, parent, p);
162  rb_insert_color(&block_group->cache_node,
163  &info->block_group_cache_tree);
164  spin_unlock(&info->block_group_cache_lock);
165 
166  return 0;
167 }
168 
169 /*
170  * This will return the block group at or after bytenr if contains is 0, else
171  * it will return the block group that contains the bytenr
172  */
173 static struct btrfs_block_group_cache *
174 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
175  int contains)
176 {
177  struct btrfs_block_group_cache *cache, *ret = NULL;
178  struct rb_node *n;
179  u64 end, start;
180 
181  spin_lock(&info->block_group_cache_lock);
182  n = info->block_group_cache_tree.rb_node;
183 
184  while (n) {
185  cache = rb_entry(n, struct btrfs_block_group_cache,
186  cache_node);
187  end = cache->key.objectid + cache->key.offset - 1;
188  start = cache->key.objectid;
189 
190  if (bytenr < start) {
191  if (!contains && (!ret || start < ret->key.objectid))
192  ret = cache;
193  n = n->rb_left;
194  } else if (bytenr > start) {
195  if (contains && bytenr <= end) {
196  ret = cache;
197  break;
198  }
199  n = n->rb_right;
200  } else {
201  ret = cache;
202  break;
203  }
204  }
205  if (ret)
206  btrfs_get_block_group(ret);
207  spin_unlock(&info->block_group_cache_lock);
208 
209  return ret;
210 }
211 
212 static int add_excluded_extent(struct btrfs_root *root,
213  u64 start, u64 num_bytes)
214 {
215  u64 end = start + num_bytes - 1;
216  set_extent_bits(&root->fs_info->freed_extents[0],
217  start, end, EXTENT_UPTODATE, GFP_NOFS);
218  set_extent_bits(&root->fs_info->freed_extents[1],
219  start, end, EXTENT_UPTODATE, GFP_NOFS);
220  return 0;
221 }
222 
223 static void free_excluded_extents(struct btrfs_root *root,
224  struct btrfs_block_group_cache *cache)
225 {
226  u64 start, end;
227 
228  start = cache->key.objectid;
229  end = start + cache->key.offset - 1;
230 
231  clear_extent_bits(&root->fs_info->freed_extents[0],
232  start, end, EXTENT_UPTODATE, GFP_NOFS);
233  clear_extent_bits(&root->fs_info->freed_extents[1],
234  start, end, EXTENT_UPTODATE, GFP_NOFS);
235 }
236 
237 static int exclude_super_stripes(struct btrfs_root *root,
238  struct btrfs_block_group_cache *cache)
239 {
240  u64 bytenr;
241  u64 *logical;
242  int stripe_len;
243  int i, nr, ret;
244 
245  if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
246  stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
247  cache->bytes_super += stripe_len;
248  ret = add_excluded_extent(root, cache->key.objectid,
249  stripe_len);
250  BUG_ON(ret); /* -ENOMEM */
251  }
252 
253  for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
254  bytenr = btrfs_sb_offset(i);
255  ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
256  cache->key.objectid, bytenr,
257  0, &logical, &nr, &stripe_len);
258  BUG_ON(ret); /* -ENOMEM */
259 
260  while (nr--) {
261  cache->bytes_super += stripe_len;
262  ret = add_excluded_extent(root, logical[nr],
263  stripe_len);
264  BUG_ON(ret); /* -ENOMEM */
265  }
266 
267  kfree(logical);
268  }
269  return 0;
270 }
271 
272 static struct btrfs_caching_control *
273 get_caching_control(struct btrfs_block_group_cache *cache)
274 {
275  struct btrfs_caching_control *ctl;
276 
277  spin_lock(&cache->lock);
278  if (cache->cached != BTRFS_CACHE_STARTED) {
279  spin_unlock(&cache->lock);
280  return NULL;
281  }
282 
283  /* We're loading it the fast way, so we don't have a caching_ctl. */
284  if (!cache->caching_ctl) {
285  spin_unlock(&cache->lock);
286  return NULL;
287  }
288 
289  ctl = cache->caching_ctl;
290  atomic_inc(&ctl->count);
291  spin_unlock(&cache->lock);
292  return ctl;
293 }
294 
295 static void put_caching_control(struct btrfs_caching_control *ctl)
296 {
297  if (atomic_dec_and_test(&ctl->count))
298  kfree(ctl);
299 }
300 
301 /*
302  * this is only called by cache_block_group. Since we could have freed extents,
303  * we need to check the pinned_extents for any extents that can't be used yet
304  * since their free space will be released as soon as the transaction commits.
305  */
306 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
307  struct btrfs_fs_info *info, u64 start, u64 end)
308 {
309  u64 extent_start, extent_end, size, total_added = 0;
310  int ret;
311 
312  while (start < end) {
313  ret = find_first_extent_bit(info->pinned_extents, start,
314  &extent_start, &extent_end,
315  EXTENT_DIRTY | EXTENT_UPTODATE,
316  NULL);
317  if (ret)
318  break;
319 
320  if (extent_start <= start) {
321  start = extent_end + 1;
322  } else if (extent_start > start && extent_start < end) {
323  size = extent_start - start;
324  total_added += size;
325  ret = btrfs_add_free_space(block_group, start,
326  size);
327  BUG_ON(ret); /* -ENOMEM or logic error */
328  start = extent_end + 1;
329  } else {
330  break;
331  }
332  }
333 
334  if (start < end) {
335  size = end - start;
336  total_added += size;
337  ret = btrfs_add_free_space(block_group, start, size);
338  BUG_ON(ret); /* -ENOMEM or logic error */
339  }
340 
341  return total_added;
342 }
343 
344 static noinline void caching_thread(struct btrfs_work *work)
345 {
346  struct btrfs_block_group_cache *block_group;
347  struct btrfs_fs_info *fs_info;
348  struct btrfs_caching_control *caching_ctl;
349  struct btrfs_root *extent_root;
350  struct btrfs_path *path;
351  struct extent_buffer *leaf;
352  struct btrfs_key key;
353  u64 total_found = 0;
354  u64 last = 0;
355  u32 nritems;
356  int ret = 0;
357 
358  caching_ctl = container_of(work, struct btrfs_caching_control, work);
359  block_group = caching_ctl->block_group;
360  fs_info = block_group->fs_info;
361  extent_root = fs_info->extent_root;
362 
363  path = btrfs_alloc_path();
364  if (!path)
365  goto out;
366 
367  last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
368 
369  /*
370  * We don't want to deadlock with somebody trying to allocate a new
371  * extent for the extent root while also trying to search the extent
372  * root to add free space. So we skip locking and search the commit
373  * root, since it's read-only
374  */
375  path->skip_locking = 1;
376  path->search_commit_root = 1;
377  path->reada = 1;
378 
379  key.objectid = last;
380  key.offset = 0;
381  key.type = BTRFS_EXTENT_ITEM_KEY;
382 again:
383  mutex_lock(&caching_ctl->mutex);
384  /* need to make sure the commit_root doesn't disappear */
385  down_read(&fs_info->extent_commit_sem);
386 
387  ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
388  if (ret < 0)
389  goto err;
390 
391  leaf = path->nodes[0];
392  nritems = btrfs_header_nritems(leaf);
393 
394  while (1) {
395  if (btrfs_fs_closing(fs_info) > 1) {
396  last = (u64)-1;
397  break;
398  }
399 
400  if (path->slots[0] < nritems) {
401  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
402  } else {
403  ret = find_next_key(path, 0, &key);
404  if (ret)
405  break;
406 
407  if (need_resched() ||
408  btrfs_next_leaf(extent_root, path)) {
409  caching_ctl->progress = last;
410  btrfs_release_path(path);
411  up_read(&fs_info->extent_commit_sem);
412  mutex_unlock(&caching_ctl->mutex);
413  cond_resched();
414  goto again;
415  }
416  leaf = path->nodes[0];
417  nritems = btrfs_header_nritems(leaf);
418  continue;
419  }
420 
421  if (key.objectid < block_group->key.objectid) {
422  path->slots[0]++;
423  continue;
424  }
425 
426  if (key.objectid >= block_group->key.objectid +
427  block_group->key.offset)
428  break;
429 
430  if (key.type == BTRFS_EXTENT_ITEM_KEY) {
431  total_found += add_new_free_space(block_group,
432  fs_info, last,
433  key.objectid);
434  last = key.objectid + key.offset;
435 
436  if (total_found > (1024 * 1024 * 2)) {
437  total_found = 0;
438  wake_up(&caching_ctl->wait);
439  }
440  }
441  path->slots[0]++;
442  }
443  ret = 0;
444 
445  total_found += add_new_free_space(block_group, fs_info, last,
446  block_group->key.objectid +
447  block_group->key.offset);
448  caching_ctl->progress = (u64)-1;
449 
450  spin_lock(&block_group->lock);
451  block_group->caching_ctl = NULL;
452  block_group->cached = BTRFS_CACHE_FINISHED;
453  spin_unlock(&block_group->lock);
454 
455 err:
456  btrfs_free_path(path);
457  up_read(&fs_info->extent_commit_sem);
458 
459  free_excluded_extents(extent_root, block_group);
460 
461  mutex_unlock(&caching_ctl->mutex);
462 out:
463  wake_up(&caching_ctl->wait);
464 
465  put_caching_control(caching_ctl);
466  btrfs_put_block_group(block_group);
467 }
468 
469 static int cache_block_group(struct btrfs_block_group_cache *cache,
470  struct btrfs_trans_handle *trans,
471  struct btrfs_root *root,
472  int load_cache_only)
473 {
474  DEFINE_WAIT(wait);
475  struct btrfs_fs_info *fs_info = cache->fs_info;
476  struct btrfs_caching_control *caching_ctl;
477  int ret = 0;
478 
479  caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
480  if (!caching_ctl)
481  return -ENOMEM;
482 
483  INIT_LIST_HEAD(&caching_ctl->list);
484  mutex_init(&caching_ctl->mutex);
485  init_waitqueue_head(&caching_ctl->wait);
486  caching_ctl->block_group = cache;
487  caching_ctl->progress = cache->key.objectid;
488  atomic_set(&caching_ctl->count, 1);
489  caching_ctl->work.func = caching_thread;
490 
491  spin_lock(&cache->lock);
492  /*
493  * This should be a rare occasion, but this could happen I think in the
494  * case where one thread starts to load the space cache info, and then
495  * some other thread starts a transaction commit which tries to do an
496  * allocation while the other thread is still loading the space cache
497  * info. The previous loop should have kept us from choosing this block
498  * group, but if we've moved to the state where we will wait on caching
499  * block groups we need to first check if we're doing a fast load here,
500  * so we can wait for it to finish, otherwise we could end up allocating
501  * from a block group whose cache gets evicted for one reason or
502  * another.
503  */
504  while (cache->cached == BTRFS_CACHE_FAST) {
505  struct btrfs_caching_control *ctl;
506 
507  ctl = cache->caching_ctl;
508  atomic_inc(&ctl->count);
509  prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
510  spin_unlock(&cache->lock);
511 
512  schedule();
513 
514  finish_wait(&ctl->wait, &wait);
515  put_caching_control(ctl);
516  spin_lock(&cache->lock);
517  }
518 
519  if (cache->cached != BTRFS_CACHE_NO) {
520  spin_unlock(&cache->lock);
521  kfree(caching_ctl);
522  return 0;
523  }
524  WARN_ON(cache->caching_ctl);
525  cache->caching_ctl = caching_ctl;
526  cache->cached = BTRFS_CACHE_FAST;
527  spin_unlock(&cache->lock);
528 
529  /*
530  * We can't do the read from on-disk cache during a commit since we need
531  * to have the normal tree locking. Also if we are currently trying to
532  * allocate blocks for the tree root we can't do the fast caching since
533  * we likely hold important locks.
534  */
535  if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
536  ret = load_free_space_cache(fs_info, cache);
537 
538  spin_lock(&cache->lock);
539  if (ret == 1) {
540  cache->caching_ctl = NULL;
541  cache->cached = BTRFS_CACHE_FINISHED;
542  cache->last_byte_to_unpin = (u64)-1;
543  } else {
544  if (load_cache_only) {
545  cache->caching_ctl = NULL;
546  cache->cached = BTRFS_CACHE_NO;
547  } else {
548  cache->cached = BTRFS_CACHE_STARTED;
549  }
550  }
551  spin_unlock(&cache->lock);
552  wake_up(&caching_ctl->wait);
553  if (ret == 1) {
554  put_caching_control(caching_ctl);
555  free_excluded_extents(fs_info->extent_root, cache);
556  return 0;
557  }
558  } else {
559  /*
560  * We are not going to do the fast caching, set cached to the
561  * appropriate value and wakeup any waiters.
562  */
563  spin_lock(&cache->lock);
564  if (load_cache_only) {
565  cache->caching_ctl = NULL;
566  cache->cached = BTRFS_CACHE_NO;
567  } else {
568  cache->cached = BTRFS_CACHE_STARTED;
569  }
570  spin_unlock(&cache->lock);
571  wake_up(&caching_ctl->wait);
572  }
573 
574  if (load_cache_only) {
575  put_caching_control(caching_ctl);
576  return 0;
577  }
578 
579  down_write(&fs_info->extent_commit_sem);
580  atomic_inc(&caching_ctl->count);
581  list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
582  up_write(&fs_info->extent_commit_sem);
583 
584  btrfs_get_block_group(cache);
585 
586  btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
587 
588  return ret;
589 }
590 
591 /*
592  * return the block group that starts at or after bytenr
593  */
594 static struct btrfs_block_group_cache *
595 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
596 {
597  struct btrfs_block_group_cache *cache;
598 
599  cache = block_group_cache_tree_search(info, bytenr, 0);
600 
601  return cache;
602 }
603 
604 /*
605  * return the block group that contains the given bytenr
606  */
607 struct btrfs_block_group_cache *btrfs_lookup_block_group(
608  struct btrfs_fs_info *info,
609  u64 bytenr)
610 {
611  struct btrfs_block_group_cache *cache;
612 
613  cache = block_group_cache_tree_search(info, bytenr, 1);
614 
615  return cache;
616 }
617 
618 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
619  u64 flags)
620 {
621  struct list_head *head = &info->space_info;
622  struct btrfs_space_info *found;
623 
624  flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
625 
626  rcu_read_lock();
627  list_for_each_entry_rcu(found, head, list) {
628  if (found->flags & flags) {
629  rcu_read_unlock();
630  return found;
631  }
632  }
633  rcu_read_unlock();
634  return NULL;
635 }
636 
637 /*
638  * after adding space to the filesystem, we need to clear the full flags
639  * on all the space infos.
640  */
641 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
642 {
643  struct list_head *head = &info->space_info;
644  struct btrfs_space_info *found;
645 
646  rcu_read_lock();
647  list_for_each_entry_rcu(found, head, list)
648  found->full = 0;
649  rcu_read_unlock();
650 }
651 
652 static u64 div_factor(u64 num, int factor)
653 {
654  if (factor == 10)
655  return num;
656  num *= factor;
657  do_div(num, 10);
658  return num;
659 }
660 
661 static u64 div_factor_fine(u64 num, int factor)
662 {
663  if (factor == 100)
664  return num;
665  num *= factor;
666  do_div(num, 100);
667  return num;
668 }
669 
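/*
 * Example: div_factor() scales by tenths and div_factor_fine() by hundredths,
 * so div_factor(1000, 9) == 900 and div_factor_fine(1000, 85) == 850.
 * btrfs_find_block_group() below uses div_factor(cache->key.offset, factor)
 * as an occupancy threshold when deciding whether a block group is still a
 * reasonable candidate.
 */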
670 u64 btrfs_find_block_group(struct btrfs_root *root,
671  u64 search_start, u64 search_hint, int owner)
672 {
673  struct btrfs_block_group_cache *cache;
674  u64 used;
675  u64 last = max(search_hint, search_start);
676  u64 group_start = 0;
677  int full_search = 0;
678  int factor = 9;
679  int wrapped = 0;
680 again:
681  while (1) {
682  cache = btrfs_lookup_first_block_group(root->fs_info, last);
683  if (!cache)
684  break;
685 
686  spin_lock(&cache->lock);
687  last = cache->key.objectid + cache->key.offset;
688  used = btrfs_block_group_used(&cache->item);
689 
690  if ((full_search || !cache->ro) &&
691  block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
692  if (used + cache->pinned + cache->reserved <
693  div_factor(cache->key.offset, factor)) {
694  group_start = cache->key.objectid;
695  spin_unlock(&cache->lock);
696  btrfs_put_block_group(cache);
697  goto found;
698  }
699  }
700  spin_unlock(&cache->lock);
701  btrfs_put_block_group(cache);
702  cond_resched();
703  }
704  if (!wrapped) {
705  last = search_start;
706  wrapped = 1;
707  goto again;
708  }
709  if (!full_search && factor < 10) {
710  last = search_start;
711  full_search = 1;
712  factor = 10;
713  goto again;
714  }
715 found:
716  return group_start;
717 }
718 
719 /* simple helper to search for an existing extent at a given offset */
720 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
721 {
722  int ret;
723  struct btrfs_key key;
724  struct btrfs_path *path;
725 
726  path = btrfs_alloc_path();
727  if (!path)
728  return -ENOMEM;
729 
730  key.objectid = start;
731  key.offset = len;
732  btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
733  ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
734  0, 0);
735  btrfs_free_path(path);
736  return ret;
737 }
738 
739 /*
740  * helper function to lookup reference count and flags of extent.
741  *
742  * the head node for delayed ref is used to store the sum of all the
743  * reference count modifications queued up in the rbtree. the head
744  * node may also store the extent flags to set. This way you can check
745  * to see what the reference count and extent flags would be if all of
746  * the delayed refs are not processed.
747  */
748 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
749  struct btrfs_root *root, u64 bytenr,
750  u64 num_bytes, u64 *refs, u64 *flags)
751 {
752  struct btrfs_delayed_ref_head *head;
753  struct btrfs_delayed_ref_root *delayed_refs;
754  struct btrfs_path *path;
755  struct btrfs_extent_item *ei;
756  struct extent_buffer *leaf;
757  struct btrfs_key key;
758  u32 item_size;
759  u64 num_refs;
760  u64 extent_flags;
761  int ret;
762 
763  path = btrfs_alloc_path();
764  if (!path)
765  return -ENOMEM;
766 
767  key.objectid = bytenr;
768  key.type = BTRFS_EXTENT_ITEM_KEY;
769  key.offset = num_bytes;
770  if (!trans) {
771  path->skip_locking = 1;
772  path->search_commit_root = 1;
773  }
774 again:
775  ret = btrfs_search_slot(trans, root->fs_info->extent_root,
776  &key, path, 0, 0);
777  if (ret < 0)
778  goto out_free;
779 
780  if (ret == 0) {
781  leaf = path->nodes[0];
782  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
783  if (item_size >= sizeof(*ei)) {
784  ei = btrfs_item_ptr(leaf, path->slots[0],
785  struct btrfs_extent_item);
786  num_refs = btrfs_extent_refs(leaf, ei);
787  extent_flags = btrfs_extent_flags(leaf, ei);
788  } else {
789 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
790  struct btrfs_extent_item_v0 *ei0;
791  BUG_ON(item_size != sizeof(*ei0));
792  ei0 = btrfs_item_ptr(leaf, path->slots[0],
793  struct btrfs_extent_item_v0);
794  num_refs = btrfs_extent_refs_v0(leaf, ei0);
795  /* FIXME: this isn't correct for data */
796  extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
797 #else
798  BUG();
799 #endif
800  }
801  BUG_ON(num_refs == 0);
802  } else {
803  num_refs = 0;
804  extent_flags = 0;
805  ret = 0;
806  }
807 
808  if (!trans)
809  goto out;
810 
811  delayed_refs = &trans->transaction->delayed_refs;
812  spin_lock(&delayed_refs->lock);
813  head = btrfs_find_delayed_ref_head(trans, bytenr);
814  if (head) {
815  if (!mutex_trylock(&head->mutex)) {
816  atomic_inc(&head->node.refs);
817  spin_unlock(&delayed_refs->lock);
818 
819  btrfs_release_path(path);
820 
821  /*
822  * Mutex was contended, block until it's released and try
823  * again
824  */
825  mutex_lock(&head->mutex);
826  mutex_unlock(&head->mutex);
827  btrfs_put_delayed_ref(&head->node);
828  goto again;
829  }
830  if (head->extent_op && head->extent_op->update_flags)
831  extent_flags |= head->extent_op->flags_to_set;
832  else
833  BUG_ON(num_refs == 0);
834 
835  num_refs += head->node.ref_mod;
836  mutex_unlock(&head->mutex);
837  }
838  spin_unlock(&delayed_refs->lock);
839 out:
840  WARN_ON(num_refs == 0);
841  if (refs)
842  *refs = num_refs;
843  if (flags)
844  *flags = extent_flags;
845 out_free:
846  btrfs_free_path(path);
847  return ret;
848 }
849 
850 /*
851  * Back reference rules. Back refs have three main goals:
852  *
853  * 1) differentiate between all holders of references to an extent so that
854  * when a reference is dropped we can make sure it was a valid reference
855  * before freeing the extent.
856  *
857  * 2) Provide enough information to quickly find the holders of an extent
858  * if we notice a given block is corrupted or bad.
859  *
860  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
861  * maintenance. This is actually the same as #2, but with a slightly
862  * different use case.
863  *
864  * There are two kinds of back refs. The implicit back refs is optimized
865  * for pointers in non-shared tree blocks. For a given pointer in a block,
866  * back refs of this kind provide information about the block's owner tree
867  * and the pointer's key. This information allows us to find the block by
868  * b-tree searching. The full back refs is for pointers in tree blocks not
869  * referenced by their owner trees. The location of tree block is recorded
870  * in the back refs. Actually the full back refs is generic, and can be
871  * used in all cases the implicit back refs is used. The major shortcoming
872  * of the full back refs is its overhead. Every time a tree block gets
873  * COWed, we have to update back refs entry for all pointers in it.
874  *
875  * For a newly allocated tree block, we use implicit back refs for
876  * pointers in it. This means most tree related operations only involve
877  * implicit back refs. For a tree block created in old transaction, the
878  * only way to drop a reference to it is COW it. So we can detect the
879  * event that tree block loses its owner tree's reference and do the
880  * back refs conversion.
881  *
882  * When a tree block is COW'd through a tree, there are four cases:
883  *
884  * The reference count of the block is one and the tree is the block's
885  * owner tree. Nothing to do in this case.
886  *
887  * The reference count of the block is one and the tree is not the
888  * block's owner tree. In this case, full back refs is used for pointers
889  * in the block. Remove these full back refs, add implicit back refs for
890  * every pointer in the new block.
891  *
892  * The reference count of the block is greater than one and the tree is
893  * the block's owner tree. In this case, implicit back refs is used for
894  * pointers in the block. Add full back refs for every pointer in the
895  * block, increase lower level extents' reference counts. The original
896  * implicit back refs are entailed to the new block.
897  *
898  * The reference count of the block is greater than one and the tree is
899  * not the block's owner tree. Add implicit back refs for every pointer in
900  * the new block, increase lower level extents' reference count.
901  *
902  * Back Reference Key composing:
903  *
904  * The key objectid corresponds to the first byte in the extent,
905  * The key type is used to differentiate between types of back refs.
906  * There are different meanings of the key offset for different types
907  * of back refs.
908  *
909  * File extents can be referenced by:
910  *
911  * - multiple snapshots, subvolumes, or different generations in one subvol
912  * - different files inside a single subvolume
913  * - different offsets inside a file (bookend extents in file.c)
914  *
915  * The extent ref structure for the implicit back refs has fields for:
916  *
917  * - Objectid of the subvolume root
918  * - objectid of the file holding the reference
919  * - original offset in the file
920  * - how many bookend extents
921  *
922  * The key offset for the implicit back refs is hash of the first
923  * three fields.
924  *
925  * The extent ref structure for the full back refs has a field for:
926  *
927  * - number of pointers in the tree leaf
928  *
929  * The key offset for the full back refs is the first byte of
930  * the tree leaf
931  *
932  * When a file extent is allocated, the implicit back refs is used.
933  * the fields are filled in:
934  *
935  * (root_key.objectid, inode objectid, offset in file, 1)
936  *
937  * When a file extent is removed by file truncation, we find the
938  * corresponding implicit back refs and check the following fields:
939  *
940  * (btrfs_header_owner(leaf), inode objectid, offset in file)
941  *
942  * Btree extents can be referenced by:
943  *
944  * - Different subvolumes
945  *
946  * Both the implicit back refs and the full back refs for tree blocks
947  * only consist of key. The key offset for the implicit back refs is
948  * objectid of block's owner tree. The key offset for the full back refs
949  * is the first byte of parent block.
950  *
951  * When implicit back refs is used, information about the lowest key and
952  * level of the tree block are required. This information is stored in
953  * the tree block info structure.
954  */
955 
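/*
 * Worked example (hypothetical byte numbers): a data extent starting at byte
 * 136314880, referenced by inode 257 at file offset 0 in the subvolume whose
 * tree id is 5, gets an implicit back ref keyed as
 *
 *	key.objectid = 136314880;		  first byte of the extent
 *	key.type     = BTRFS_EXTENT_DATA_REF_KEY;
 *	key.offset   = hash_extent_data_ref(5, 257, 0);
 *
 * If the same extent were instead referenced through its parent leaf at byte
 * 30408704 (a full back ref), the key would use
 *
 *	key.type     = BTRFS_SHARED_DATA_REF_KEY;
 *	key.offset   = 30408704;		  first byte of the parent block
 *
 * matching the key setup in lookup_extent_data_ref() below.
 */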
956 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
957 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
958  struct btrfs_root *root,
959  struct btrfs_path *path,
960  u64 owner, u32 extra_size)
961 {
962  struct btrfs_extent_item *item;
963  struct btrfs_extent_item_v0 *ei0;
964  struct btrfs_extent_ref_v0 *ref0;
965  struct btrfs_tree_block_info *bi;
966  struct extent_buffer *leaf;
967  struct btrfs_key key;
968  struct btrfs_key found_key;
969  u32 new_size = sizeof(*item);
970  u64 refs;
971  int ret;
972 
973  leaf = path->nodes[0];
974  BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
975 
976  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
977  ei0 = btrfs_item_ptr(leaf, path->slots[0],
978  struct btrfs_extent_item_v0);
979  refs = btrfs_extent_refs_v0(leaf, ei0);
980 
981  if (owner == (u64)-1) {
982  while (1) {
983  if (path->slots[0] >= btrfs_header_nritems(leaf)) {
984  ret = btrfs_next_leaf(root, path);
985  if (ret < 0)
986  return ret;
987  BUG_ON(ret > 0); /* Corruption */
988  leaf = path->nodes[0];
989  }
990  btrfs_item_key_to_cpu(leaf, &found_key,
991  path->slots[0]);
992  BUG_ON(key.objectid != found_key.objectid);
993  if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
994  path->slots[0]++;
995  continue;
996  }
997  ref0 = btrfs_item_ptr(leaf, path->slots[0],
998  struct btrfs_extent_ref_v0);
999  owner = btrfs_ref_objectid_v0(leaf, ref0);
1000  break;
1001  }
1002  }
1003  btrfs_release_path(path);
1004 
1005  if (owner < BTRFS_FIRST_FREE_OBJECTID)
1006  new_size += sizeof(*bi);
1007 
1008  new_size -= sizeof(*ei0);
1009  ret = btrfs_search_slot(trans, root, &key, path,
1010  new_size + extra_size, 1);
1011  if (ret < 0)
1012  return ret;
1013  BUG_ON(ret); /* Corruption */
1014 
1015  btrfs_extend_item(trans, root, path, new_size);
1016 
1017  leaf = path->nodes[0];
1018  item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1019  btrfs_set_extent_refs(leaf, item, refs);
1020  /* FIXME: get real generation */
1021  btrfs_set_extent_generation(leaf, item, 0);
1022  if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1023  btrfs_set_extent_flags(leaf, item,
1024  BTRFS_EXTENT_FLAG_TREE_BLOCK |
1025  BTRFS_BLOCK_FLAG_FULL_BACKREF);
1026  bi = (struct btrfs_tree_block_info *)(item + 1);
1027  /* FIXME: get first key of the block */
1028  memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1029  btrfs_set_tree_block_level(leaf, bi, (int)owner);
1030  } else {
1031  btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1032  }
1033  btrfs_mark_buffer_dirty(leaf);
1034  return 0;
1035 }
1036 #endif
1037 
1038 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1039 {
1040  u32 high_crc = ~(u32)0;
1041  u32 low_crc = ~(u32)0;
1042  __le64 lenum;
1043 
1044  lenum = cpu_to_le64(root_objectid);
1045  high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1046  lenum = cpu_to_le64(owner);
1047  low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1048  lenum = cpu_to_le64(offset);
1049  low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1050 
1051  return ((u64)high_crc << 31) ^ (u64)low_crc;
1052 }
1053 
1054 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1055  struct btrfs_extent_data_ref *ref)
1056 {
1057  return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1058  btrfs_extent_data_ref_objectid(leaf, ref),
1059  btrfs_extent_data_ref_offset(leaf, ref));
1060 }
1061 
1062 static int match_extent_data_ref(struct extent_buffer *leaf,
1063  struct btrfs_extent_data_ref *ref,
1064  u64 root_objectid, u64 owner, u64 offset)
1065 {
1066  if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1067  btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1068  btrfs_extent_data_ref_offset(leaf, ref) != offset)
1069  return 0;
1070  return 1;
1071 }
1072 
1073 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1074  struct btrfs_root *root,
1075  struct btrfs_path *path,
1076  u64 bytenr, u64 parent,
1077  u64 root_objectid,
1078  u64 owner, u64 offset)
1079 {
1080  struct btrfs_key key;
1081  struct btrfs_extent_data_ref *ref;
1082  struct extent_buffer *leaf;
1083  u32 nritems;
1084  int ret;
1085  int recow;
1086  int err = -ENOENT;
1087 
1088  key.objectid = bytenr;
1089  if (parent) {
1090  key.type = BTRFS_SHARED_DATA_REF_KEY;
1091  key.offset = parent;
1092  } else {
1093  key.type = BTRFS_EXTENT_DATA_REF_KEY;
1094  key.offset = hash_extent_data_ref(root_objectid,
1095  owner, offset);
1096  }
1097 again:
1098  recow = 0;
1099  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1100  if (ret < 0) {
1101  err = ret;
1102  goto fail;
1103  }
1104 
1105  if (parent) {
1106  if (!ret)
1107  return 0;
1108 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1109  key.type = BTRFS_EXTENT_REF_V0_KEY;
1110  btrfs_release_path(path);
1111  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1112  if (ret < 0) {
1113  err = ret;
1114  goto fail;
1115  }
1116  if (!ret)
1117  return 0;
1118 #endif
1119  goto fail;
1120  }
1121 
1122  leaf = path->nodes[0];
1123  nritems = btrfs_header_nritems(leaf);
1124  while (1) {
1125  if (path->slots[0] >= nritems) {
1126  ret = btrfs_next_leaf(root, path);
1127  if (ret < 0)
1128  err = ret;
1129  if (ret)
1130  goto fail;
1131 
1132  leaf = path->nodes[0];
1133  nritems = btrfs_header_nritems(leaf);
1134  recow = 1;
1135  }
1136 
1137  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1138  if (key.objectid != bytenr ||
1139  key.type != BTRFS_EXTENT_DATA_REF_KEY)
1140  goto fail;
1141 
1142  ref = btrfs_item_ptr(leaf, path->slots[0],
1143  struct btrfs_extent_data_ref);
1144 
1145  if (match_extent_data_ref(leaf, ref, root_objectid,
1146  owner, offset)) {
1147  if (recow) {
1148  btrfs_release_path(path);
1149  goto again;
1150  }
1151  err = 0;
1152  break;
1153  }
1154  path->slots[0]++;
1155  }
1156 fail:
1157  return err;
1158 }
1159 
1160 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1161  struct btrfs_root *root,
1162  struct btrfs_path *path,
1163  u64 bytenr, u64 parent,
1164  u64 root_objectid, u64 owner,
1165  u64 offset, int refs_to_add)
1166 {
1167  struct btrfs_key key;
1168  struct extent_buffer *leaf;
1169  u32 size;
1170  u32 num_refs;
1171  int ret;
1172 
1173  key.objectid = bytenr;
1174  if (parent) {
1175  key.type = BTRFS_SHARED_DATA_REF_KEY;
1176  key.offset = parent;
1177  size = sizeof(struct btrfs_shared_data_ref);
1178  } else {
1179  key.type = BTRFS_EXTENT_DATA_REF_KEY;
1180  key.offset = hash_extent_data_ref(root_objectid,
1181  owner, offset);
1182  size = sizeof(struct btrfs_extent_data_ref);
1183  }
1184 
1185  ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1186  if (ret && ret != -EEXIST)
1187  goto fail;
1188 
1189  leaf = path->nodes[0];
1190  if (parent) {
1191  struct btrfs_shared_data_ref *ref;
1192  ref = btrfs_item_ptr(leaf, path->slots[0],
1193  struct btrfs_shared_data_ref);
1194  if (ret == 0) {
1195  btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1196  } else {
1197  num_refs = btrfs_shared_data_ref_count(leaf, ref);
1198  num_refs += refs_to_add;
1199  btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1200  }
1201  } else {
1202  struct btrfs_extent_data_ref *ref;
1203  while (ret == -EEXIST) {
1204  ref = btrfs_item_ptr(leaf, path->slots[0],
1205  struct btrfs_extent_data_ref);
1206  if (match_extent_data_ref(leaf, ref, root_objectid,
1207  owner, offset))
1208  break;
1209  btrfs_release_path(path);
1210  key.offset++;
1211  ret = btrfs_insert_empty_item(trans, root, path, &key,
1212  size);
1213  if (ret && ret != -EEXIST)
1214  goto fail;
1215 
1216  leaf = path->nodes[0];
1217  }
1218  ref = btrfs_item_ptr(leaf, path->slots[0],
1219  struct btrfs_extent_data_ref);
1220  if (ret == 0) {
1221  btrfs_set_extent_data_ref_root(leaf, ref,
1222  root_objectid);
1223  btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1224  btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1225  btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1226  } else {
1227  num_refs = btrfs_extent_data_ref_count(leaf, ref);
1228  num_refs += refs_to_add;
1229  btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1230  }
1231  }
1232  btrfs_mark_buffer_dirty(leaf);
1233  ret = 0;
1234 fail:
1235  btrfs_release_path(path);
1236  return ret;
1237 }
1238 
1239 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1240  struct btrfs_root *root,
1241  struct btrfs_path *path,
1242  int refs_to_drop)
1243 {
1244  struct btrfs_key key;
1245  struct btrfs_extent_data_ref *ref1 = NULL;
1246  struct btrfs_shared_data_ref *ref2 = NULL;
1247  struct extent_buffer *leaf;
1248  u32 num_refs = 0;
1249  int ret = 0;
1250 
1251  leaf = path->nodes[0];
1252  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1253 
1254  if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1255  ref1 = btrfs_item_ptr(leaf, path->slots[0],
1256  struct btrfs_extent_data_ref);
1257  num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1258  } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1259  ref2 = btrfs_item_ptr(leaf, path->slots[0],
1260  struct btrfs_shared_data_ref);
1261  num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1262 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1263  } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1264  struct btrfs_extent_ref_v0 *ref0;
1265  ref0 = btrfs_item_ptr(leaf, path->slots[0],
1266  struct btrfs_extent_ref_v0);
1267  num_refs = btrfs_ref_count_v0(leaf, ref0);
1268 #endif
1269  } else {
1270  BUG();
1271  }
1272 
1273  BUG_ON(num_refs < refs_to_drop);
1274  num_refs -= refs_to_drop;
1275 
1276  if (num_refs == 0) {
1277  ret = btrfs_del_item(trans, root, path);
1278  } else {
1279  if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1280  btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1281  else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1282  btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1283 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1284  else {
1285  struct btrfs_extent_ref_v0 *ref0;
1286  ref0 = btrfs_item_ptr(leaf, path->slots[0],
1287  struct btrfs_extent_ref_v0);
1288  btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1289  }
1290 #endif
1291  btrfs_mark_buffer_dirty(leaf);
1292  }
1293  return ret;
1294 }
1295 
1296 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1297  struct btrfs_path *path,
1298  struct btrfs_extent_inline_ref *iref)
1299 {
1300  struct btrfs_key key;
1301  struct extent_buffer *leaf;
1302  struct btrfs_extent_data_ref *ref1;
1303  struct btrfs_shared_data_ref *ref2;
1304  u32 num_refs = 0;
1305 
1306  leaf = path->nodes[0];
1307  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1308  if (iref) {
1309  if (btrfs_extent_inline_ref_type(leaf, iref) ==
1310  BTRFS_EXTENT_DATA_REF_KEY) {
1311  ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1312  num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1313  } else {
1314  ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1315  num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1316  }
1317  } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1318  ref1 = btrfs_item_ptr(leaf, path->slots[0],
1319  struct btrfs_extent_data_ref);
1320  num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1321  } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1322  ref2 = btrfs_item_ptr(leaf, path->slots[0],
1323  struct btrfs_shared_data_ref);
1324  num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1325 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1326  } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1327  struct btrfs_extent_ref_v0 *ref0;
1328  ref0 = btrfs_item_ptr(leaf, path->slots[0],
1329  struct btrfs_extent_ref_v0);
1330  num_refs = btrfs_ref_count_v0(leaf, ref0);
1331 #endif
1332  } else {
1333  WARN_ON(1);
1334  }
1335  return num_refs;
1336 }
1337 
1338 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1339  struct btrfs_root *root,
1340  struct btrfs_path *path,
1341  u64 bytenr, u64 parent,
1342  u64 root_objectid)
1343 {
1344  struct btrfs_key key;
1345  int ret;
1346 
1347  key.objectid = bytenr;
1348  if (parent) {
1349  key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1350  key.offset = parent;
1351  } else {
1352  key.type = BTRFS_TREE_BLOCK_REF_KEY;
1353  key.offset = root_objectid;
1354  }
1355 
1356  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1357  if (ret > 0)
1358  ret = -ENOENT;
1359 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1360  if (ret == -ENOENT && parent) {
1361  btrfs_release_path(path);
1362  key.type = BTRFS_EXTENT_REF_V0_KEY;
1363  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1364  if (ret > 0)
1365  ret = -ENOENT;
1366  }
1367 #endif
1368  return ret;
1369 }
1370 
1371 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1372  struct btrfs_root *root,
1373  struct btrfs_path *path,
1374  u64 bytenr, u64 parent,
1375  u64 root_objectid)
1376 {
1377  struct btrfs_key key;
1378  int ret;
1379 
1380  key.objectid = bytenr;
1381  if (parent) {
1382  key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1383  key.offset = parent;
1384  } else {
1385  key.type = BTRFS_TREE_BLOCK_REF_KEY;
1386  key.offset = root_objectid;
1387  }
1388 
1389  ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1390  btrfs_release_path(path);
1391  return ret;
1392 }
1393 
1394 static inline int extent_ref_type(u64 parent, u64 owner)
1395 {
1396  int type;
1397  if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1398  if (parent > 0)
1399  type = BTRFS_SHARED_BLOCK_REF_KEY;
1400  else
1401  type = BTRFS_TREE_BLOCK_REF_KEY;
1402  } else {
1403  if (parent > 0)
1404  type = BTRFS_SHARED_DATA_REF_KEY;
1405  else
1406  type = BTRFS_EXTENT_DATA_REF_KEY;
1407  }
1408  return type;
1409 }
1410 
1411 static int find_next_key(struct btrfs_path *path, int level,
1412  struct btrfs_key *key)
1413 
1414 {
1415  for (; level < BTRFS_MAX_LEVEL; level++) {
1416  if (!path->nodes[level])
1417  break;
1418  if (path->slots[level] + 1 >=
1419  btrfs_header_nritems(path->nodes[level]))
1420  continue;
1421  if (level == 0)
1422  btrfs_item_key_to_cpu(path->nodes[level], key,
1423  path->slots[level] + 1);
1424  else
1425  btrfs_node_key_to_cpu(path->nodes[level], key,
1426  path->slots[level] + 1);
1427  return 0;
1428  }
1429  return 1;
1430 }
1431 
1432 /*
1433  * look for inline back ref. if back ref is found, *ref_ret is set
1434  * to the address of inline back ref, and 0 is returned.
1435  *
1436  * if back ref isn't found, *ref_ret is set to the address where it
1437  * should be inserted, and -ENOENT is returned.
1438  *
1439  * if insert is true and there are too many inline back refs, the path
1440  * points to the extent item, and -EAGAIN is returned.
1441  *
1442  * NOTE: inline back refs are ordered in the same way that back ref
1443  * items in the tree are ordered.
1444  */
1445 static noinline_for_stack
1446 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1447  struct btrfs_root *root,
1448  struct btrfs_path *path,
1449  struct btrfs_extent_inline_ref **ref_ret,
1450  u64 bytenr, u64 num_bytes,
1451  u64 parent, u64 root_objectid,
1452  u64 owner, u64 offset, int insert)
1453 {
1454  struct btrfs_key key;
1455  struct extent_buffer *leaf;
1456  struct btrfs_extent_item *ei;
1457  struct btrfs_extent_inline_ref *iref;
1458  u64 flags;
1459  u64 item_size;
1460  unsigned long ptr;
1461  unsigned long end;
1462  int extra_size;
1463  int type;
1464  int want;
1465  int ret;
1466  int err = 0;
1467 
1468  key.objectid = bytenr;
1469  key.type = BTRFS_EXTENT_ITEM_KEY;
1470  key.offset = num_bytes;
1471 
1472  want = extent_ref_type(parent, owner);
1473  if (insert) {
1474  extra_size = btrfs_extent_inline_ref_size(want);
1475  path->keep_locks = 1;
1476  } else
1477  extra_size = -1;
1478  ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1479  if (ret < 0) {
1480  err = ret;
1481  goto out;
1482  }
1483  if (ret && !insert) {
1484  err = -ENOENT;
1485  goto out;
1486  }
1487  BUG_ON(ret); /* Corruption */
1488 
1489  leaf = path->nodes[0];
1490  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1491 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1492  if (item_size < sizeof(*ei)) {
1493  if (!insert) {
1494  err = -ENOENT;
1495  goto out;
1496  }
1497  ret = convert_extent_item_v0(trans, root, path, owner,
1498  extra_size);
1499  if (ret < 0) {
1500  err = ret;
1501  goto out;
1502  }
1503  leaf = path->nodes[0];
1504  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1505  }
1506 #endif
1507  BUG_ON(item_size < sizeof(*ei));
1508 
1509  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1510  flags = btrfs_extent_flags(leaf, ei);
1511 
1512  ptr = (unsigned long)(ei + 1);
1513  end = (unsigned long)ei + item_size;
1514 
1515  if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1516  ptr += sizeof(struct btrfs_tree_block_info);
1517  BUG_ON(ptr > end);
1518  } else {
1519  BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1520  }
1521 
1522  err = -ENOENT;
1523  while (1) {
1524  if (ptr >= end) {
1525  WARN_ON(ptr > end);
1526  break;
1527  }
1528  iref = (struct btrfs_extent_inline_ref *)ptr;
1529  type = btrfs_extent_inline_ref_type(leaf, iref);
1530  if (want < type)
1531  break;
1532  if (want > type) {
1533  ptr += btrfs_extent_inline_ref_size(type);
1534  continue;
1535  }
1536 
1537  if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1538  struct btrfs_extent_data_ref *dref;
1539  dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1540  if (match_extent_data_ref(leaf, dref, root_objectid,
1541  owner, offset)) {
1542  err = 0;
1543  break;
1544  }
1545  if (hash_extent_data_ref_item(leaf, dref) <
1546  hash_extent_data_ref(root_objectid, owner, offset))
1547  break;
1548  } else {
1549  u64 ref_offset;
1550  ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1551  if (parent > 0) {
1552  if (parent == ref_offset) {
1553  err = 0;
1554  break;
1555  }
1556  if (ref_offset < parent)
1557  break;
1558  } else {
1559  if (root_objectid == ref_offset) {
1560  err = 0;
1561  break;
1562  }
1563  if (ref_offset < root_objectid)
1564  break;
1565  }
1566  }
1567  ptr += btrfs_extent_inline_ref_size(type);
1568  }
1569  if (err == -ENOENT && insert) {
1570  if (item_size + extra_size >=
1571  BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1572  err = -EAGAIN;
1573  goto out;
1574  }
1575  /*
1576  * To add new inline back ref, we have to make sure
1577  * there is no corresponding back ref item.
1578  * For simplicity, we just do not add new inline back
1579  * ref if there is any kind of item for this block
1580  */
1581  if (find_next_key(path, 0, &key) == 0 &&
1582  key.objectid == bytenr &&
1583  key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1584  err = -EAGAIN;
1585  goto out;
1586  }
1587  }
1588  *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1589 out:
1590  if (insert) {
1591  path->keep_locks = 0;
1592  btrfs_unlock_up_safe(path, 1);
1593  }
1594  return err;
1595 }
1596 
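/*
 * How the callers in this file use the three results documented above:
 * 0 means the inline ref at *ref_ret is updated in place via
 * update_inline_extent_backref(), -ENOENT means a new inline ref is created
 * at the returned position via setup_inline_extent_backref(), and -EAGAIN
 * makes __btrfs_inc_extent_ref() fall back to a separate keyed backref item
 * through insert_extent_backref().
 */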
1597 /*
1598  * helper to add new inline back ref
1599  */
1600 static noinline_for_stack
1601 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1602  struct btrfs_root *root,
1603  struct btrfs_path *path,
1604  struct btrfs_extent_inline_ref *iref,
1605  u64 parent, u64 root_objectid,
1606  u64 owner, u64 offset, int refs_to_add,
1607  struct btrfs_delayed_extent_op *extent_op)
1608 {
1609  struct extent_buffer *leaf;
1610  struct btrfs_extent_item *ei;
1611  unsigned long ptr;
1612  unsigned long end;
1613  unsigned long item_offset;
1614  u64 refs;
1615  int size;
1616  int type;
1617 
1618  leaf = path->nodes[0];
1619  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1620  item_offset = (unsigned long)iref - (unsigned long)ei;
1621 
1622  type = extent_ref_type(parent, owner);
1623  size = btrfs_extent_inline_ref_size(type);
1624 
1625  btrfs_extend_item(trans, root, path, size);
1626 
1627  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1628  refs = btrfs_extent_refs(leaf, ei);
1629  refs += refs_to_add;
1630  btrfs_set_extent_refs(leaf, ei, refs);
1631  if (extent_op)
1632  __run_delayed_extent_op(extent_op, leaf, ei);
1633 
1634  ptr = (unsigned long)ei + item_offset;
1635  end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1636  if (ptr < end - size)
1637  memmove_extent_buffer(leaf, ptr + size, ptr,
1638  end - size - ptr);
1639 
1640  iref = (struct btrfs_extent_inline_ref *)ptr;
1641  btrfs_set_extent_inline_ref_type(leaf, iref, type);
1642  if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1643  struct btrfs_extent_data_ref *dref;
1644  dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1645  btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1646  btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1647  btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1648  btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1649  } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1650  struct btrfs_shared_data_ref *sref;
1651  sref = (struct btrfs_shared_data_ref *)(iref + 1);
1652  btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1653  btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1654  } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1655  btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1656  } else {
1657  btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1658  }
1659  btrfs_mark_buffer_dirty(leaf);
1660 }
1661 
1662 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1663  struct btrfs_root *root,
1664  struct btrfs_path *path,
1665  struct btrfs_extent_inline_ref **ref_ret,
1666  u64 bytenr, u64 num_bytes, u64 parent,
1667  u64 root_objectid, u64 owner, u64 offset)
1668 {
1669  int ret;
1670 
1671  ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1672  bytenr, num_bytes, parent,
1673  root_objectid, owner, offset, 0);
1674  if (ret != -ENOENT)
1675  return ret;
1676 
1677  btrfs_release_path(path);
1678  *ref_ret = NULL;
1679 
1680  if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1681  ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1682  root_objectid);
1683  } else {
1684  ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1685  root_objectid, owner, offset);
1686  }
1687  return ret;
1688 }
1689 
1690 /*
1691  * helper to update/remove inline back ref
1692  */
1693 static noinline_for_stack
1694 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1695  struct btrfs_root *root,
1696  struct btrfs_path *path,
1697  struct btrfs_extent_inline_ref *iref,
1698  int refs_to_mod,
1699  struct btrfs_delayed_extent_op *extent_op)
1700 {
1701  struct extent_buffer *leaf;
1702  struct btrfs_extent_item *ei;
1703  struct btrfs_extent_data_ref *dref = NULL;
1704  struct btrfs_shared_data_ref *sref = NULL;
1705  unsigned long ptr;
1706  unsigned long end;
1707  u32 item_size;
1708  int size;
1709  int type;
1710  u64 refs;
1711 
1712  leaf = path->nodes[0];
1713  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1714  refs = btrfs_extent_refs(leaf, ei);
1715  WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1716  refs += refs_to_mod;
1717  btrfs_set_extent_refs(leaf, ei, refs);
1718  if (extent_op)
1719  __run_delayed_extent_op(extent_op, leaf, ei);
1720 
1721  type = btrfs_extent_inline_ref_type(leaf, iref);
1722 
1723  if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1724  dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1725  refs = btrfs_extent_data_ref_count(leaf, dref);
1726  } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1727  sref = (struct btrfs_shared_data_ref *)(iref + 1);
1728  refs = btrfs_shared_data_ref_count(leaf, sref);
1729  } else {
1730  refs = 1;
1731  BUG_ON(refs_to_mod != -1);
1732  }
1733 
1734  BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1735  refs += refs_to_mod;
1736 
1737  if (refs > 0) {
1738  if (type == BTRFS_EXTENT_DATA_REF_KEY)
1739  btrfs_set_extent_data_ref_count(leaf, dref, refs);
1740  else
1741  btrfs_set_shared_data_ref_count(leaf, sref, refs);
1742  } else {
1743  size = btrfs_extent_inline_ref_size(type);
1744  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1745  ptr = (unsigned long)iref;
1746  end = (unsigned long)ei + item_size;
1747  if (ptr + size < end)
1748  memmove_extent_buffer(leaf, ptr, ptr + size,
1749  end - ptr - size);
1750  item_size -= size;
1751  btrfs_truncate_item(trans, root, path, item_size, 1);
1752  }
1753  btrfs_mark_buffer_dirty(leaf);
1754 }
1755 
1756 static noinline_for_stack
1757 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1758  struct btrfs_root *root,
1759  struct btrfs_path *path,
1760  u64 bytenr, u64 num_bytes, u64 parent,
1761  u64 root_objectid, u64 owner,
1762  u64 offset, int refs_to_add,
1763  struct btrfs_delayed_extent_op *extent_op)
1764 {
1765  struct btrfs_extent_inline_ref *iref;
1766  int ret;
1767 
1768  ret = lookup_inline_extent_backref(trans, root, path, &iref,
1769  bytenr, num_bytes, parent,
1770  root_objectid, owner, offset, 1);
1771  if (ret == 0) {
1772  BUG_ON(refs_to_add != 1);
1773  update_inline_extent_backref(trans, root, path, iref,
1774  refs_to_add, extent_op);
1775  } else if (ret == -ENOENT) {
1776  setup_inline_extent_backref(trans, root, path, iref, parent,
1777  root_objectid, owner, offset,
1778  refs_to_add, extent_op);
1779  ret = 0;
1780  }
1781  return ret;
1782 }
1783 
1784 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1785  struct btrfs_root *root,
1786  struct btrfs_path *path,
1787  u64 bytenr, u64 parent, u64 root_objectid,
1788  u64 owner, u64 offset, int refs_to_add)
1789 {
1790  int ret;
1791  if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1792  BUG_ON(refs_to_add != 1);
1793  ret = insert_tree_block_ref(trans, root, path, bytenr,
1794  parent, root_objectid);
1795  } else {
1796  ret = insert_extent_data_ref(trans, root, path, bytenr,
1797  parent, root_objectid,
1798  owner, offset, refs_to_add);
1799  }
1800  return ret;
1801 }
1802 
1803 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1804  struct btrfs_root *root,
1805  struct btrfs_path *path,
1806  struct btrfs_extent_inline_ref *iref,
1807  int refs_to_drop, int is_data)
1808 {
1809  int ret = 0;
1810 
1811  BUG_ON(!is_data && refs_to_drop != 1);
1812  if (iref) {
1813  update_inline_extent_backref(trans, root, path, iref,
1814  -refs_to_drop, NULL);
1815  } else if (is_data) {
1816  ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1817  } else {
1818  ret = btrfs_del_item(trans, root, path);
1819  }
1820  return ret;
1821 }
1822 
1823 static int btrfs_issue_discard(struct block_device *bdev,
1824  u64 start, u64 len)
1825 {
1826  return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1827 }
1828 
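/*
 * blkdev_issue_discard() takes its start and length in units of 512-byte
 * sectors, which is why btrfs_issue_discard() shifts the byte values right
 * by 9 above.
 */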
1829 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1830  u64 num_bytes, u64 *actual_bytes)
1831 {
1832  int ret;
1833  u64 discarded_bytes = 0;
1834  struct btrfs_bio *bbio = NULL;
1835 
1836 
1837  /* Tell the block device(s) that the sectors can be discarded */
1838  ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1839  bytenr, &num_bytes, &bbio, 0);
1840  /* Error condition is -ENOMEM */
1841  if (!ret) {
1842  struct btrfs_bio_stripe *stripe = bbio->stripes;
1843  int i;
1844 
1845 
1846  for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1847  if (!stripe->dev->can_discard)
1848  continue;
1849 
1850  ret = btrfs_issue_discard(stripe->dev->bdev,
1851  stripe->physical,
1852  stripe->length);
1853  if (!ret)
1854  discarded_bytes += stripe->length;
1855  else if (ret != -EOPNOTSUPP)
1856  break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1857 
1858  /*
1859  * Just in case we get back EOPNOTSUPP for some reason,
1860  * just ignore the return value so we don't screw up
1861  * people calling discard_extent.
1862  */
1863  ret = 0;
1864  }
1865  kfree(bbio);
1866  }
1867 
1868  if (actual_bytes)
1869  *actual_bytes = discarded_bytes;
1870 
1871 
1872  return ret;
1873 }
1874 
1875 /* Can return -ENOMEM */
1876 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1877  struct btrfs_root *root,
1878  u64 bytenr, u64 num_bytes, u64 parent,
1879  u64 root_objectid, u64 owner, u64 offset, int for_cow)
1880 {
1881  int ret;
1882  struct btrfs_fs_info *fs_info = root->fs_info;
1883 
1884  BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1885  root_objectid == BTRFS_TREE_LOG_OBJECTID);
1886 
1887  if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1888  ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1889  num_bytes,
1890  parent, root_objectid, (int)owner,
1891  BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1892  } else {
1893  ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1894  num_bytes,
1895  parent, root_objectid, owner, offset,
1896  BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1897  }
1898  return ret;
1899 }
1900 
1901 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1902  struct btrfs_root *root,
1903  u64 bytenr, u64 num_bytes,
1904  u64 parent, u64 root_objectid,
1905  u64 owner, u64 offset, int refs_to_add,
1906  struct btrfs_delayed_extent_op *extent_op)
1907 {
1908  struct btrfs_path *path;
1909  struct extent_buffer *leaf;
1910  struct btrfs_extent_item *item;
1911  u64 refs;
1912  int ret;
1913  int err = 0;
1914 
1915  path = btrfs_alloc_path();
1916  if (!path)
1917  return -ENOMEM;
1918 
1919  path->reada = 1;
1920  path->leave_spinning = 1;
1921  /* this will set up the path even if it fails to insert the back ref */
1922  ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1923  path, bytenr, num_bytes, parent,
1924  root_objectid, owner, offset,
1925  refs_to_add, extent_op);
1926  if (ret == 0)
1927  goto out;
1928 
1929  if (ret != -EAGAIN) {
1930  err = ret;
1931  goto out;
1932  }
1933 
1934  leaf = path->nodes[0];
1935  item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1936  refs = btrfs_extent_refs(leaf, item);
1937  btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1938  if (extent_op)
1939  __run_delayed_extent_op(extent_op, leaf, item);
1940 
1941  btrfs_mark_buffer_dirty(leaf);
1942  btrfs_release_path(path);
1943 
1944  path->reada = 1;
1945  path->leave_spinning = 1;
1946 
1947  /* now insert the actual backref */
1948  ret = insert_extent_backref(trans, root->fs_info->extent_root,
1949  path, bytenr, parent, root_objectid,
1950  owner, offset, refs_to_add);
1951  if (ret)
1952  btrfs_abort_transaction(trans, root, ret);
1953 out:
1954  btrfs_free_path(path);
1955  return err;
1956 }
1957 
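/*
 * Apply one delayed data ref: allocate the reserved file extent when this is
 * the first ADD for a newly allocated extent, otherwise just increment or
 * drop the reference count.
 */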
1958 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1959  struct btrfs_root *root,
1960  struct btrfs_delayed_ref_node *node,
1961  struct btrfs_delayed_extent_op *extent_op,
1962  int insert_reserved)
1963 {
1964  int ret = 0;
1965  struct btrfs_delayed_data_ref *ref;
1966  struct btrfs_key ins;
1967  u64 parent = 0;
1968  u64 ref_root = 0;
1969  u64 flags = 0;
1970 
1971  ins.objectid = node->bytenr;
1972  ins.offset = node->num_bytes;
1973  ins.type = BTRFS_EXTENT_ITEM_KEY;
1974 
1975  ref = btrfs_delayed_node_to_data_ref(node);
1976  if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1977  parent = ref->parent;
1978  else
1979  ref_root = ref->root;
1980 
1981  if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1982  if (extent_op) {
1983  BUG_ON(extent_op->update_key);
1984  flags |= extent_op->flags_to_set;
1985  }
1986  ret = alloc_reserved_file_extent(trans, root,
1987  parent, ref_root, flags,
1988  ref->objectid, ref->offset,
1989  &ins, node->ref_mod);
1990  } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1991  ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1992  node->num_bytes, parent,
1993  ref_root, ref->objectid,
1994  ref->offset, node->ref_mod,
1995  extent_op);
1996  } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1997  ret = __btrfs_free_extent(trans, root, node->bytenr,
1998  node->num_bytes, parent,
1999  ref_root, ref->objectid,
2000  ref->offset, node->ref_mod,
2001  extent_op);
2002  } else {
2003  BUG();
2004  }
2005  return ret;
2006 }
2007 
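/*
 * Fold the flag and tree-block-key updates carried by a delayed extent op
 * into the extent item in the given leaf.
 */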
2008 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2009  struct extent_buffer *leaf,
2010  struct btrfs_extent_item *ei)
2011 {
2012  u64 flags = btrfs_extent_flags(leaf, ei);
2013  if (extent_op->update_flags) {
2014  flags |= extent_op->flags_to_set;
2015  btrfs_set_extent_flags(leaf, ei, flags);
2016  }
2017 
2018  if (extent_op->update_key) {
2019  struct btrfs_tree_block_info *bi;
2020  BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2021  bi = (struct btrfs_tree_block_info *)(ei + 1);
2022  btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2023  }
2024 }
2025 
2026 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2027  struct btrfs_root *root,
2028  struct btrfs_delayed_ref_node *node,
2029  struct btrfs_delayed_extent_op *extent_op)
2030 {
2031  struct btrfs_key key;
2032  struct btrfs_path *path;
2033  struct btrfs_extent_item *ei;
2034  struct extent_buffer *leaf;
2035  u32 item_size;
2036  int ret;
2037  int err = 0;
2038 
2039  if (trans->aborted)
2040  return 0;
2041 
2042  path = btrfs_alloc_path();
2043  if (!path)
2044  return -ENOMEM;
2045 
2046  key.objectid = node->bytenr;
2047  key.type = BTRFS_EXTENT_ITEM_KEY;
2048  key.offset = node->num_bytes;
2049 
2050  path->reada = 1;
2051  path->leave_spinning = 1;
2052  ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2053  path, 0, 1);
2054  if (ret < 0) {
2055  err = ret;
2056  goto out;
2057  }
2058  if (ret > 0) {
2059  err = -EIO;
2060  goto out;
2061  }
2062 
2063  leaf = path->nodes[0];
2064  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2065 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2066  if (item_size < sizeof(*ei)) {
2067  ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2068  path, (u64)-1, 0);
2069  if (ret < 0) {
2070  err = ret;
2071  goto out;
2072  }
2073  leaf = path->nodes[0];
2074  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2075  }
2076 #endif
2077  BUG_ON(item_size < sizeof(*ei));
2078  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2079  __run_delayed_extent_op(extent_op, leaf, ei);
2080 
2081  btrfs_mark_buffer_dirty(leaf);
2082 out:
2083  btrfs_free_path(path);
2084  return err;
2085 }
2086 
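/*
 * Tree-block counterpart of run_delayed_data_ref; delayed tree refs always
 * carry a ref_mod of 1.
 */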
2087 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2088  struct btrfs_root *root,
2089  struct btrfs_delayed_ref_node *node,
2090  struct btrfs_delayed_extent_op *extent_op,
2091  int insert_reserved)
2092 {
2093  int ret = 0;
2094  struct btrfs_delayed_tree_ref *ref;
2095  struct btrfs_key ins;
2096  u64 parent = 0;
2097  u64 ref_root = 0;
2098 
2099  ins.objectid = node->bytenr;
2100  ins.offset = node->num_bytes;
2101  ins.type = BTRFS_EXTENT_ITEM_KEY;
2102 
2103  ref = btrfs_delayed_node_to_tree_ref(node);
2104  if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2105  parent = ref->parent;
2106  else
2107  ref_root = ref->root;
2108 
2109  BUG_ON(node->ref_mod != 1);
2110  if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2111  BUG_ON(!extent_op || !extent_op->update_flags ||
2112  !extent_op->update_key);
2113  ret = alloc_reserved_tree_block(trans, root,
2114  parent, ref_root,
2115  extent_op->flags_to_set,
2116  &extent_op->key,
2117  ref->level, &ins);
2118  } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2119  ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2120  node->num_bytes, parent, ref_root,
2121  ref->level, 0, 1, extent_op);
2122  } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2123  ret = __btrfs_free_extent(trans, root, node->bytenr,
2124  node->num_bytes, parent, ref_root,
2125  ref->level, 0, 1, extent_op);
2126  } else {
2127  BUG();
2128  }
2129  return ret;
2130 }
2131 
2132 /* helper function to actually process a single delayed ref entry */
2133 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2134  struct btrfs_root *root,
2135  struct btrfs_delayed_ref_node *node,
2136  struct btrfs_delayed_extent_op *extent_op,
2137  int insert_reserved)
2138 {
2139  int ret = 0;
2140 
2141  if (trans->aborted)
2142  return 0;
2143 
2144  if (btrfs_delayed_ref_is_head(node)) {
2145  struct btrfs_delayed_ref_head *head;
2146  /*
2147  * we've hit the end of the chain and we were supposed
2148  * to insert this extent into the tree. But, it got
2149  * deleted before we ever needed to insert it, so all
2150  * we have to do is clean up the accounting
2151  */
2152  BUG_ON(extent_op);
2153  head = btrfs_delayed_node_to_head(node);
2154  if (insert_reserved) {
2155  btrfs_pin_extent(root, node->bytenr,
2156  node->num_bytes, 1);
2157  if (head->is_data) {
2158  ret = btrfs_del_csums(trans, root,
2159  node->bytenr,
2160  node->num_bytes);
2161  }
2162  }
2163  mutex_unlock(&head->mutex);
2164  return ret;
2165  }
2166 
2167  if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2168  node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2169  ret = run_delayed_tree_ref(trans, root, node, extent_op,
2170  insert_reserved);
2171  else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2172  node->type == BTRFS_SHARED_DATA_REF_KEY)
2173  ret = run_delayed_data_ref(trans, root, node, extent_op,
2174  insert_reserved);
2175  else
2176  BUG();
2177  return ret;
2178 }
2179 
2180 static noinline struct btrfs_delayed_ref_node *
2181 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2182 {
2183  struct rb_node *node;
2184  struct btrfs_delayed_ref_node *ref;
2185  int action = BTRFS_ADD_DELAYED_REF;
2186 again:
2187  /*
2188  * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2189  * this prevents ref count from going down to zero when
2190  * there are still pending delayed refs.
2191  */
2192  node = rb_prev(&head->node.rb_node);
2193  while (1) {
2194  if (!node)
2195  break;
2196  ref = rb_entry(node, struct btrfs_delayed_ref_node,
2197  rb_node);
2198  if (ref->bytenr != head->node.bytenr)
2199  break;
2200  if (ref->action == action)
2201  return ref;
2202  node = rb_prev(node);
2203  }
2204  if (action == BTRFS_ADD_DELAYED_REF) {
2205  action = BTRFS_DROP_DELAYED_REF;
2206  goto again;
2207  }
2208  return NULL;
2209 }
2210 
2211 /*
2212  * Returns 0 on success or if called with an already aborted transaction.
2213  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2214  */
2215 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2216  struct btrfs_root *root,
2217  struct list_head *cluster)
2218 {
2219  struct btrfs_delayed_ref_root *delayed_refs;
2220  struct btrfs_delayed_ref_node *ref;
2221  struct btrfs_delayed_ref_head *locked_ref = NULL;
2222  struct btrfs_delayed_extent_op *extent_op;
2223  struct btrfs_fs_info *fs_info = root->fs_info;
2224  int ret;
2225  int count = 0;
2226  int must_insert_reserved = 0;
2227 
2228  delayed_refs = &trans->transaction->delayed_refs;
2229  while (1) {
2230  if (!locked_ref) {
2231  /* pick a new head ref from the cluster list */
2232  if (list_empty(cluster))
2233  break;
2234 
2235  locked_ref = list_entry(cluster->next,
2236  struct btrfs_delayed_ref_head, cluster);
2237 
2238  /* grab the lock that says we are going to process
2239  * all the refs for this head */
2240  ret = btrfs_delayed_ref_lock(trans, locked_ref);
2241 
2242  /*
2243  * we may have dropped the spin lock to get the head
2244  * mutex lock, and that might have given someone else
2245  * time to free the head. If that's true, it has been
2246  * removed from our list and we can move on.
2247  */
2248  if (ret == -EAGAIN) {
2249  locked_ref = NULL;
2250  count++;
2251  continue;
2252  }
2253  }
2254 
2255  /*
2256  * We need to try and merge add/drops of the same ref since we
2257  * can run into issues with relocate dropping the implicit ref
2258  * and then it being added back again before the drop can
2259  * finish. If we merged anything we need to re-loop so we can
2260  * get a good ref.
2261  */
2262  btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2263  locked_ref);
2264 
2265  /*
2266  * locked_ref is the head node, so we have to go one
2267  * node back for any delayed ref updates
2268  */
2269  ref = select_delayed_ref(locked_ref);
2270 
2271  if (ref && ref->seq &&
2272  btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2273  /*
2274  * there are still refs with lower seq numbers in the
2275  * process of being added. Don't run this ref yet.
2276  */
2277  list_del_init(&locked_ref->cluster);
2278  mutex_unlock(&locked_ref->mutex);
2279  locked_ref = NULL;
2280  delayed_refs->num_heads_ready++;
2281  spin_unlock(&delayed_refs->lock);
2282  cond_resched();
2283  spin_lock(&delayed_refs->lock);
2284  continue;
2285  }
2286 
2287  /*
2288  * record the must insert reserved flag before we
2289  * drop the spin lock.
2290  */
2291  must_insert_reserved = locked_ref->must_insert_reserved;
2292  locked_ref->must_insert_reserved = 0;
2293 
2294  extent_op = locked_ref->extent_op;
2295  locked_ref->extent_op = NULL;
2296 
2297  if (!ref) {
2298  /* All delayed refs have been processed, go ahead
2299  * and send the head node to run_one_delayed_ref,
2300  * so that any accounting fixes can happen
2301  */
2302  ref = &locked_ref->node;
2303 
2304  if (extent_op && must_insert_reserved) {
2305  kfree(extent_op);
2306  extent_op = NULL;
2307  }
2308 
2309  if (extent_op) {
2310  spin_unlock(&delayed_refs->lock);
2311 
2312  ret = run_delayed_extent_op(trans, root,
2313  ref, extent_op);
2314  kfree(extent_op);
2315 
2316  if (ret) {
2317  printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318  spin_lock(&delayed_refs->lock);
2319  return ret;
2320  }
2321 
2322  goto next;
2323  }
2324 
2325  list_del_init(&locked_ref->cluster);
2326  locked_ref = NULL;
2327  }
2328 
2329  ref->in_tree = 0;
2330  rb_erase(&ref->rb_node, &delayed_refs->root);
2331  delayed_refs->num_entries--;
2332  if (locked_ref) {
2333  /*
2334  * when we play the delayed ref, also correct the
2335  * ref_mod on head
2336  */
2337  switch (ref->action) {
2338  case BTRFS_ADD_DELAYED_REF:
2339  case BTRFS_ADD_DELAYED_EXTENT:
2340  locked_ref->node.ref_mod -= ref->ref_mod;
2341  break;
2342  case BTRFS_DROP_DELAYED_REF:
2343  locked_ref->node.ref_mod += ref->ref_mod;
2344  break;
2345  default:
2346  WARN_ON(1);
2347  }
2348  }
2349  spin_unlock(&delayed_refs->lock);
2350 
2351  ret = run_one_delayed_ref(trans, root, ref, extent_op,
2352  must_insert_reserved);
2353 
2354  btrfs_put_delayed_ref(ref);
2355  kfree(extent_op);
2356  count++;
2357 
2358  if (ret) {
2359  printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360  spin_lock(&delayed_refs->lock);
2361  return ret;
2362  }
2363 
2364 next:
2365  cond_resched();
2366  spin_lock(&delayed_refs->lock);
2367  }
2368  return count;
2369 }
2370 
2371 #ifdef SCRAMBLE_DELAYED_REFS
2372 /*
2373  * Normally delayed refs get processed in ascending bytenr order. This
2374  * correlates in most cases to the order added. To expose dependencies on this
2375  * order, we start to process the tree in the middle instead of the beginning
2376  */
2377 static u64 find_middle(struct rb_root *root)
2378 {
2379  struct rb_node *n = root->rb_node;
2380  struct btrfs_delayed_ref_node *entry;
2381  int alt = 1;
2382  u64 middle;
2383  u64 first = 0, last = 0;
2384 
2385  n = rb_first(root);
2386  if (n) {
2387  entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2388  first = entry->bytenr;
2389  }
2390  n = rb_last(root);
2391  if (n) {
2392  entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2393  last = entry->bytenr;
2394  }
2395  n = root->rb_node;
2396 
2397  while (n) {
2398  entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2399  WARN_ON(!entry->in_tree);
2400 
2401  middle = entry->bytenr;
2402 
2403  if (alt)
2404  n = n->rb_left;
2405  else
2406  n = n->rb_right;
2407 
2408  alt = 1 - alt;
2409  }
2410  return middle;
2411 }
2412 #endif
2413 
2414 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2415  struct btrfs_fs_info *fs_info)
2416 {
2417  struct qgroup_update *qgroup_update;
2418  int ret = 0;
2419 
2420  if (list_empty(&trans->qgroup_ref_list) !=
2421  !trans->delayed_ref_elem.seq) {
2422  /* list without seq or seq without list */
2423  printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2424  list_empty(&trans->qgroup_ref_list) ? "" : " not",
2425  trans->delayed_ref_elem.seq);
2426  BUG();
2427  }
2428 
2429  if (!trans->delayed_ref_elem.seq)
2430  return 0;
2431 
2432  while (!list_empty(&trans->qgroup_ref_list)) {
2433  qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2434  struct qgroup_update, list);
2435  list_del(&qgroup_update->list);
2436  if (!ret)
2437  ret = btrfs_qgroup_account_ref(
2438  trans, fs_info, qgroup_update->node,
2439  qgroup_update->extent_op);
2440  kfree(qgroup_update);
2441  }
2442 
2443  btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2444 
2445  return ret;
2446 }
2447 
2448 /*
2449  * this starts processing the delayed reference count updates and
2450  * extent insertions we have queued up so far. count can be
2451  * 0, which means to process everything in the tree at the start
2452  * of the run (but not newly added entries), or it can be some target
2453  * number you'd like to process.
2454  *
2455  * Returns 0 on success or if called with an aborted transaction
2456  * Returns <0 on error and aborts the transaction
2457  */
2458 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2459  struct btrfs_root *root, unsigned long count)
2460 {
2461  struct rb_node *node;
2462  struct btrfs_delayed_ref_root *delayed_refs;
2463  struct btrfs_delayed_ref_node *ref;
2464  struct list_head cluster;
2465  int ret;
2466  u64 delayed_start;
2467  int run_all = count == (unsigned long)-1;
2468  int run_most = 0;
2469  int loops;
2470 
2471  /* We'll clean this up in btrfs_cleanup_transaction */
2472  if (trans->aborted)
2473  return 0;
2474 
2475  if (root == root->fs_info->extent_root)
2476  root = root->fs_info->tree_root;
2477 
2478  btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2479 
2480  delayed_refs = &trans->transaction->delayed_refs;
2481  INIT_LIST_HEAD(&cluster);
2482 again:
2483  loops = 0;
2484  spin_lock(&delayed_refs->lock);
2485 
2486 #ifdef SCRAMBLE_DELAYED_REFS
2487  delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2488 #endif
2489 
2490  if (count == 0) {
2491  count = delayed_refs->num_entries * 2;
2492  run_most = 1;
2493  }
2494  while (1) {
2495  if (!(run_all || run_most) &&
2496  delayed_refs->num_heads_ready < 64)
2497  break;
2498 
2499  /*
2500  * go find something we can process in the rbtree. We start at
2501  * the beginning of the tree, and then build a cluster
2502  * of refs to process starting at the first one we are able to
2503  * lock
2504  */
2505  delayed_start = delayed_refs->run_delayed_start;
2506  ret = btrfs_find_ref_cluster(trans, &cluster,
2507  delayed_refs->run_delayed_start);
2508  if (ret)
2509  break;
2510 
2511  ret = run_clustered_refs(trans, root, &cluster);
2512  if (ret < 0) {
2513  spin_unlock(&delayed_refs->lock);
2514  btrfs_abort_transaction(trans, root, ret);
2515  return ret;
2516  }
2517 
2518  count -= min_t(unsigned long, ret, count);
2519 
2520  if (count == 0)
2521  break;
2522 
2523  if (delayed_start >= delayed_refs->run_delayed_start) {
2524  if (loops == 0) {
2525  /*
2526  * btrfs_find_ref_cluster looped. let's do one
2527  * more cycle. if we don't run any delayed refs
2528  * during that cycle (because all of them are
2529  * blocked), bail out.
2530  */
2531  loops = 1;
2532  } else {
2533  /*
2534  * no runnable refs left, stop trying
2535  */
2536  BUG_ON(run_all);
2537  break;
2538  }
2539  }
2540  if (ret) {
2541  /* refs were run, let's reset staleness detection */
2542  loops = 0;
2543  }
2544  }
2545 
2546  if (run_all) {
2547  if (!list_empty(&trans->new_bgs)) {
2548  spin_unlock(&delayed_refs->lock);
2549  btrfs_create_pending_block_groups(trans, root);
2550  spin_lock(&delayed_refs->lock);
2551  }
2552 
2553  node = rb_first(&delayed_refs->root);
2554  if (!node)
2555  goto out;
2556  count = (unsigned long)-1;
2557 
2558  while (node) {
2559  ref = rb_entry(node, struct btrfs_delayed_ref_node,
2560  rb_node);
2561  if (btrfs_delayed_ref_is_head(ref)) {
2562  struct btrfs_delayed_ref_head *head;
2563 
2564  head = btrfs_delayed_node_to_head(ref);
2565  atomic_inc(&ref->refs);
2566 
2567  spin_unlock(&delayed_refs->lock);
2568  /*
2569  * Mutex was contended, block until it's
2570  * released and try again
2571  */
2572  mutex_lock(&head->mutex);
2573  mutex_unlock(&head->mutex);
2574 
2575  btrfs_put_delayed_ref(ref);
2576  cond_resched();
2577  goto again;
2578  }
2579  node = rb_next(node);
2580  }
2581  spin_unlock(&delayed_refs->lock);
2582  schedule_timeout(1);
2583  goto again;
2584  }
2585 out:
2586  spin_unlock(&delayed_refs->lock);
2587  assert_qgroups_uptodate(trans);
2588  return 0;
2589 }
2590 
2591 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2592  struct btrfs_root *root,
2593  u64 bytenr, u64 num_bytes, u64 flags,
2594  int is_data)
2595 {
2596  struct btrfs_delayed_extent_op *extent_op;
2597  int ret;
2598 
2599  extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2600  if (!extent_op)
2601  return -ENOMEM;
2602 
2603  extent_op->flags_to_set = flags;
2604  extent_op->update_flags = 1;
2605  extent_op->update_key = 0;
2606  extent_op->is_data = is_data ? 1 : 0;
2607 
2608  ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2609  num_bytes, extent_op);
2610  if (ret)
2611  kfree(extent_op);
2612  return ret;
2613 }
2614 
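/*
 * Look for pending delayed refs on bytenr that belong to someone other than
 * this root/inode/offset. Returns 0 when the only pending ref is our own,
 * 1 if another reference exists, -ENOENT if there is no delayed head, or
 * -EAGAIN if the head mutex was contended and the caller should retry.
 */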
2615 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2616  struct btrfs_root *root,
2617  struct btrfs_path *path,
2618  u64 objectid, u64 offset, u64 bytenr)
2619 {
2620  struct btrfs_delayed_ref_head *head;
2621  struct btrfs_delayed_ref_node *ref;
2622  struct btrfs_delayed_data_ref *data_ref;
2623  struct btrfs_delayed_ref_root *delayed_refs;
2624  struct rb_node *node;
2625  int ret = 0;
2626 
2627  ret = -ENOENT;
2628  delayed_refs = &trans->transaction->delayed_refs;
2629  spin_lock(&delayed_refs->lock);
2630  head = btrfs_find_delayed_ref_head(trans, bytenr);
2631  if (!head)
2632  goto out;
2633 
2634  if (!mutex_trylock(&head->mutex)) {
2635  atomic_inc(&head->node.refs);
2636  spin_unlock(&delayed_refs->lock);
2637 
2638  btrfs_release_path(path);
2639 
2640  /*
2641  * Mutex was contended, block until it's released and let
2642  * caller try again
2643  */
2644  mutex_lock(&head->mutex);
2645  mutex_unlock(&head->mutex);
2646  btrfs_put_delayed_ref(&head->node);
2647  return -EAGAIN;
2648  }
2649 
2650  node = rb_prev(&head->node.rb_node);
2651  if (!node)
2652  goto out_unlock;
2653 
2654  ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2655 
2656  if (ref->bytenr != bytenr)
2657  goto out_unlock;
2658 
2659  ret = 1;
2660  if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2661  goto out_unlock;
2662 
2663  data_ref = btrfs_delayed_node_to_data_ref(ref);
2664 
2665  node = rb_prev(node);
2666  if (node) {
2667  int seq = ref->seq;
2668 
2669  ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2670  if (ref->bytenr == bytenr && ref->seq == seq)
2671  goto out_unlock;
2672  }
2673 
2674  if (data_ref->root != root->root_key.objectid ||
2675  data_ref->objectid != objectid || data_ref->offset != offset)
2676  goto out_unlock;
2677 
2678  ret = 0;
2679 out_unlock:
2680  mutex_unlock(&head->mutex);
2681 out:
2682  spin_unlock(&delayed_refs->lock);
2683  return ret;
2684 }
2685 
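/*
 * Same cross-reference check against the committed extent tree: returns 0
 * only when the extent carries a single inline data ref owned by this
 * root/inode/offset and was created after the last snapshot; non-zero
 * otherwise.
 */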
2686 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2687  struct btrfs_root *root,
2688  struct btrfs_path *path,
2689  u64 objectid, u64 offset, u64 bytenr)
2690 {
2691  struct btrfs_root *extent_root = root->fs_info->extent_root;
2692  struct extent_buffer *leaf;
2693  struct btrfs_extent_data_ref *ref;
2694  struct btrfs_extent_inline_ref *iref;
2695  struct btrfs_extent_item *ei;
2696  struct btrfs_key key;
2697  u32 item_size;
2698  int ret;
2699 
2700  key.objectid = bytenr;
2701  key.offset = (u64)-1;
2702  key.type = BTRFS_EXTENT_ITEM_KEY;
2703 
2704  ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2705  if (ret < 0)
2706  goto out;
2707  BUG_ON(ret == 0); /* Corruption */
2708 
2709  ret = -ENOENT;
2710  if (path->slots[0] == 0)
2711  goto out;
2712 
2713  path->slots[0]--;
2714  leaf = path->nodes[0];
2715  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2716 
2717  if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2718  goto out;
2719 
2720  ret = 1;
2721  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2722 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2723  if (item_size < sizeof(*ei)) {
2724  WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2725  goto out;
2726  }
2727 #endif
2728  ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2729 
2730  if (item_size != sizeof(*ei) +
2731  btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2732  goto out;
2733 
2734  if (btrfs_extent_generation(leaf, ei) <=
2735  btrfs_root_last_snapshot(&root->root_item))
2736  goto out;
2737 
2738  iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2739  if (btrfs_extent_inline_ref_type(leaf, iref) !=
2740  BTRFS_EXTENT_DATA_REF_KEY)
2741  goto out;
2742 
2743  ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2744  if (btrfs_extent_refs(leaf, ei) !=
2745  btrfs_extent_data_ref_count(leaf, ref) ||
2746  btrfs_extent_data_ref_root(leaf, ref) !=
2747  root->root_key.objectid ||
2748  btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2749  btrfs_extent_data_ref_offset(leaf, ref) != offset)
2750  goto out;
2751 
2752  ret = 0;
2753 out:
2754  return ret;
2755 }
2756 
2757 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2758  struct btrfs_root *root,
2759  u64 objectid, u64 offset, u64 bytenr)
2760 {
2761  struct btrfs_path *path;
2762  int ret;
2763  int ret2;
2764 
2765  path = btrfs_alloc_path();
2766  if (!path)
2767  return -ENOENT;
2768 
2769  do {
2770  ret = check_committed_ref(trans, root, path, objectid,
2771  offset, bytenr);
2772  if (ret && ret != -ENOENT)
2773  goto out;
2774 
2775  ret2 = check_delayed_ref(trans, root, path, objectid,
2776  offset, bytenr);
2777  } while (ret2 == -EAGAIN);
2778 
2779  if (ret2 && ret2 != -ENOENT) {
2780  ret = ret2;
2781  goto out;
2782  }
2783 
2784  if (ret != -ENOENT || ret2 != -ENOENT)
2785  ret = 0;
2786 out:
2787  btrfs_free_path(path);
2788  if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2789  WARN_ON(ret > 0);
2790  return ret;
2791 }
2792 
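/*
 * Walk every pointer in the buffer (file extents for a leaf, child blocks
 * for a node) and add or drop one reference on each, depending on inc.
 */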
2793 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2794  struct btrfs_root *root,
2795  struct extent_buffer *buf,
2796  int full_backref, int inc, int for_cow)
2797 {
2798  u64 bytenr;
2799  u64 num_bytes;
2800  u64 parent;
2801  u64 ref_root;
2802  u32 nritems;
2803  struct btrfs_key key;
2804  struct btrfs_file_extent_item *fi;
2805  int i;
2806  int level;
2807  int ret = 0;
2808  int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2809  u64, u64, u64, u64, u64, u64, int);
2810 
2811  ref_root = btrfs_header_owner(buf);
2812  nritems = btrfs_header_nritems(buf);
2813  level = btrfs_header_level(buf);
2814 
2815  if (!root->ref_cows && level == 0)
2816  return 0;
2817 
2818  if (inc)
2819  process_func = btrfs_inc_extent_ref;
2820  else
2821  process_func = btrfs_free_extent;
2822 
2823  if (full_backref)
2824  parent = buf->start;
2825  else
2826  parent = 0;
2827 
2828  for (i = 0; i < nritems; i++) {
2829  if (level == 0) {
2830  btrfs_item_key_to_cpu(buf, &key, i);
2831  if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2832  continue;
2833  fi = btrfs_item_ptr(buf, i,
2834  struct btrfs_file_extent_item);
2835  if (btrfs_file_extent_type(buf, fi) ==
2836  BTRFS_FILE_EXTENT_INLINE)
2837  continue;
2838  bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2839  if (bytenr == 0)
2840  continue;
2841 
2842  num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2843  key.offset -= btrfs_file_extent_offset(buf, fi);
2844  ret = process_func(trans, root, bytenr, num_bytes,
2845  parent, ref_root, key.objectid,
2846  key.offset, for_cow);
2847  if (ret)
2848  goto fail;
2849  } else {
2850  bytenr = btrfs_node_blockptr(buf, i);
2851  num_bytes = btrfs_level_size(root, level - 1);
2852  ret = process_func(trans, root, bytenr, num_bytes,
2853  parent, ref_root, level - 1, 0,
2854  for_cow);
2855  if (ret)
2856  goto fail;
2857  }
2858  }
2859  return 0;
2860 fail:
2861  return ret;
2862 }
2863 
2864 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2865  struct extent_buffer *buf, int full_backref, int for_cow)
2866 {
2867  return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2868 }
2869 
2870 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2871  struct extent_buffer *buf, int full_backref, int for_cow)
2872 {
2873  return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2874 }
2875 
2876 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2877  struct btrfs_root *root,
2878  struct btrfs_path *path,
2879  struct btrfs_block_group_cache *cache)
2880 {
2881  int ret;
2882  struct btrfs_root *extent_root = root->fs_info->extent_root;
2883  unsigned long bi;
2884  struct extent_buffer *leaf;
2885 
2886  ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2887  if (ret < 0)
2888  goto fail;
2889  BUG_ON(ret); /* Corruption */
2890 
2891  leaf = path->nodes[0];
2892  bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2893  write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2894  btrfs_mark_buffer_dirty(leaf);
2895  btrfs_release_path(path);
2896 fail:
2897  if (ret) {
2898  btrfs_abort_transaction(trans, root, ret);
2899  return ret;
2900  }
2901  return 0;
2902 
2903 }
2904 
2905 static struct btrfs_block_group_cache *
2906 next_block_group(struct btrfs_root *root,
2907  struct btrfs_block_group_cache *cache)
2908 {
2909  struct rb_node *node;
2910  spin_lock(&root->fs_info->block_group_cache_lock);
2911  node = rb_next(&cache->cache_node);
2912  btrfs_put_block_group(cache);
2913  if (node) {
2914  cache = rb_entry(node, struct btrfs_block_group_cache,
2915  cache_node);
2916  btrfs_get_block_group(cache);
2917  } else
2918  cache = NULL;
2919  spin_unlock(&root->fs_info->block_group_cache_lock);
2920  return cache;
2921 }
2922 
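/*
 * Prepare the free space cache inode for this block group: create it if
 * needed, truncate any stale contents and preallocate room for the new
 * cache, recording the result in disk_cache_state.
 */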
2923 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2924  struct btrfs_trans_handle *trans,
2925  struct btrfs_path *path)
2926 {
2927  struct btrfs_root *root = block_group->fs_info->tree_root;
2928  struct inode *inode = NULL;
2929  u64 alloc_hint = 0;
2930  int dcs = BTRFS_DC_ERROR;
2931  int num_pages = 0;
2932  int retries = 0;
2933  int ret = 0;
2934 
2935  /*
2936  * If this block group is smaller than 100 megs don't bother caching the
2937  * block group.
2938  */
2939  if (block_group->key.offset < (100 * 1024 * 1024)) {
2940  spin_lock(&block_group->lock);
2941  block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2942  spin_unlock(&block_group->lock);
2943  return 0;
2944  }
2945 
2946 again:
2947  inode = lookup_free_space_inode(root, block_group, path);
2948  if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2949  ret = PTR_ERR(inode);
2950  btrfs_release_path(path);
2951  goto out;
2952  }
2953 
2954  if (IS_ERR(inode)) {
2955  BUG_ON(retries);
2956  retries++;
2957 
2958  if (block_group->ro)
2959  goto out_free;
2960 
2961  ret = create_free_space_inode(root, trans, block_group, path);
2962  if (ret)
2963  goto out_free;
2964  goto again;
2965  }
2966 
2967  /* We've already setup this transaction, go ahead and exit */
2968  if (block_group->cache_generation == trans->transid &&
2969  i_size_read(inode)) {
2970  dcs = BTRFS_DC_SETUP;
2971  goto out_put;
2972  }
2973 
2974  /*
2975  * We want to set the generation to 0, that way if anything goes wrong
2976  * from here on out we know not to trust this cache when we load up next
2977  * time.
2978  */
2979  BTRFS_I(inode)->generation = 0;
2980  ret = btrfs_update_inode(trans, root, inode);
2981  WARN_ON(ret);
2982 
2983  if (i_size_read(inode) > 0) {
2984  ret = btrfs_truncate_free_space_cache(root, trans, path,
2985  inode);
2986  if (ret)
2987  goto out_put;
2988  }
2989 
2990  spin_lock(&block_group->lock);
2991  if (block_group->cached != BTRFS_CACHE_FINISHED ||
2992  !btrfs_test_opt(root, SPACE_CACHE)) {
2993  /*
2994  * don't bother trying to write stuff out _if_
2995  * a) we're not cached,
2996  * b) we're mounted with the nospace_cache option.
2997  */
2998  dcs = BTRFS_DC_WRITTEN;
2999  spin_unlock(&block_group->lock);
3000  goto out_put;
3001  }
3002  spin_unlock(&block_group->lock);
3003 
3004  /*
3005  * Try to preallocate enough space based on how big the block group is.
3006  * Keep in mind this has to include any pinned space which could end up
3007  * taking up quite a bit since it's not folded into the other space
3008  * cache.
3009  */
3010  num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3011  if (!num_pages)
3012  num_pages = 1;
3013 
3014  num_pages *= 16;
3015  num_pages *= PAGE_CACHE_SIZE;
3016 
3017  ret = btrfs_check_data_free_space(inode, num_pages);
3018  if (ret)
3019  goto out_put;
3020 
3021  ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3022  num_pages, num_pages,
3023  &alloc_hint);
3024  if (!ret)
3025  dcs = BTRFS_DC_SETUP;
3026  btrfs_free_reserved_data_space(inode, num_pages);
3027 
3028 out_put:
3029  iput(inode);
3030 out_free:
3031  btrfs_release_path(path);
3032 out:
3033  spin_lock(&block_group->lock);
3034  if (!ret && dcs == BTRFS_DC_SETUP)
3035  block_group->cache_generation = trans->transid;
3036  block_group->disk_cache_state = dcs;
3037  spin_unlock(&block_group->lock);
3038 
3039  return ret;
3040 }
3041 
3042 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3043  struct btrfs_root *root)
3044 {
3045  struct btrfs_block_group_cache *cache;
3046  int err = 0;
3047  struct btrfs_path *path;
3048  u64 last = 0;
3049 
3050  path = btrfs_alloc_path();
3051  if (!path)
3052  return -ENOMEM;
3053 
3054 again:
3055  while (1) {
3056  cache = btrfs_lookup_first_block_group(root->fs_info, last);
3057  while (cache) {
3058  if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3059  break;
3060  cache = next_block_group(root, cache);
3061  }
3062  if (!cache) {
3063  if (last == 0)
3064  break;
3065  last = 0;
3066  continue;
3067  }
3068  err = cache_save_setup(cache, trans, path);
3069  last = cache->key.objectid + cache->key.offset;
3070  btrfs_put_block_group(cache);
3071  }
3072 
3073  while (1) {
3074  if (last == 0) {
3075  err = btrfs_run_delayed_refs(trans, root,
3076  (unsigned long)-1);
3077  if (err) /* File system offline */
3078  goto out;
3079  }
3080 
3081  cache = btrfs_lookup_first_block_group(root->fs_info, last);
3082  while (cache) {
3083  if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3084  btrfs_put_block_group(cache);
3085  goto again;
3086  }
3087 
3088  if (cache->dirty)
3089  break;
3090  cache = next_block_group(root, cache);
3091  }
3092  if (!cache) {
3093  if (last == 0)
3094  break;
3095  last = 0;
3096  continue;
3097  }
3098 
3099  if (cache->disk_cache_state == BTRFS_DC_SETUP)
3100  cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3101  cache->dirty = 0;
3102  last = cache->key.objectid + cache->key.offset;
3103 
3104  err = write_one_cache_group(trans, root, path, cache);
3105  if (err) /* File system offline */
3106  goto out;
3107 
3108  btrfs_put_block_group(cache);
3109  }
3110 
3111  while (1) {
3112  /*
3113  * I don't think this is needed since we're just marking our
3114  * preallocated extent as written, but just in case it can't
3115  * hurt.
3116  */
3117  if (last == 0) {
3118  err = btrfs_run_delayed_refs(trans, root,
3119  (unsigned long)-1);
3120  if (err) /* File system offline */
3121  goto out;
3122  }
3123 
3124  cache = btrfs_lookup_first_block_group(root->fs_info, last);
3125  while (cache) {
3126  /*
3127  * Really this shouldn't happen, but it could if we
3128  * couldn't write the entire preallocated extent and
3129  * splitting the extent resulted in a new block.
3130  */
3131  if (cache->dirty) {
3132  btrfs_put_block_group(cache);
3133  goto again;
3134  }
3135  if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3136  break;
3137  cache = next_block_group(root, cache);
3138  }
3139  if (!cache) {
3140  if (last == 0)
3141  break;
3142  last = 0;
3143  continue;
3144  }
3145 
3146  err = btrfs_write_out_cache(root, trans, cache, path);
3147 
3148  /*
3149  * If we didn't have an error then the cache state is still
3150  * NEED_WRITE, so we can set it to WRITTEN.
3151  */
3152  if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3153  cache->disk_cache_state = BTRFS_DC_WRITTEN;
3154  last = cache->key.objectid + cache->key.offset;
3155  btrfs_put_block_group(cache);
3156  }
3157 out:
3158 
3159  btrfs_free_path(path);
3160  return err;
3161 }
3162 
3163 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3164 {
3165  struct btrfs_block_group_cache *block_group;
3166  int readonly = 0;
3167 
3168  block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3169  if (!block_group || block_group->ro)
3170  readonly = 1;
3171  if (block_group)
3172  btrfs_put_block_group(block_group);
3173  return readonly;
3174 }
3175 
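/*
 * Find the space_info for the given flags, or create it if it does not
 * exist yet, and fold the new block group's bytes into its totals.
 */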
3176 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3177  u64 total_bytes, u64 bytes_used,
3178  struct btrfs_space_info **space_info)
3179 {
3180  struct btrfs_space_info *found;
3181  int i;
3182  int factor;
3183 
3184  if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3185  BTRFS_BLOCK_GROUP_RAID10))
3186  factor = 2;
3187  else
3188  factor = 1;
3189 
3190  found = __find_space_info(info, flags);
3191  if (found) {
3192  spin_lock(&found->lock);
3193  found->total_bytes += total_bytes;
3194  found->disk_total += total_bytes * factor;
3195  found->bytes_used += bytes_used;
3196  found->disk_used += bytes_used * factor;
3197  found->full = 0;
3198  spin_unlock(&found->lock);
3199  *space_info = found;
3200  return 0;
3201  }
3202  found = kzalloc(sizeof(*found), GFP_NOFS);
3203  if (!found)
3204  return -ENOMEM;
3205 
3206  for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3207  INIT_LIST_HEAD(&found->block_groups[i]);
3208  init_rwsem(&found->groups_sem);
3209  spin_lock_init(&found->lock);
3210  found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3211  found->total_bytes = total_bytes;
3212  found->disk_total = total_bytes * factor;
3213  found->bytes_used = bytes_used;
3214  found->disk_used = bytes_used * factor;
3215  found->bytes_pinned = 0;
3216  found->bytes_reserved = 0;
3217  found->bytes_readonly = 0;
3218  found->bytes_may_use = 0;
3219  found->full = 0;
3220  found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3221  found->chunk_alloc = 0;
3222  found->flush = 0;
3223  init_waitqueue_head(&found->wait);
3224  *space_info = found;
3225  list_add_rcu(&found->list, &info->space_info);
3226  if (flags & BTRFS_BLOCK_GROUP_DATA)
3227  info->data_sinfo = found;
3228  return 0;
3229 }
3230 
3231 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3232 {
3233  u64 extra_flags = chunk_to_extended(flags) &
3234  BTRFS_EXTENDED_PROFILE_MASK;
3235 
3236  if (flags & BTRFS_BLOCK_GROUP_DATA)
3237  fs_info->avail_data_alloc_bits |= extra_flags;
3238  if (flags & BTRFS_BLOCK_GROUP_METADATA)
3239  fs_info->avail_metadata_alloc_bits |= extra_flags;
3240  if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3241  fs_info->avail_system_alloc_bits |= extra_flags;
3242 }
3243 
3244 /*
3245  * returns target flags in extended format or 0 if restripe for this
3246  * chunk_type is not in progress
3247  *
3248  * should be called with either volume_mutex or balance_lock held
3249  */
3250 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3251 {
3252  struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3253  u64 target = 0;
3254 
3255  if (!bctl)
3256  return 0;
3257 
3258  if (flags & BTRFS_BLOCK_GROUP_DATA &&
3259  bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3260  target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3261  } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3262  bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3263  target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3264  } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3265  bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3266  target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3267  }
3268 
3269  return target;
3270 }
3271 
3272 /*
3273  * @flags: available profiles in extended format (see ctree.h)
3274  *
3275  * Returns reduced profile in chunk format. If profile changing is in
3276  * progress (either running or paused) picks the target profile (if it's
3277  * already available), otherwise falls back to plain reducing.
3278  */
3279 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3280 {
3281  /*
3282  * we add in the count of missing devices because we want
3283  * to make sure that any RAID levels on a degraded FS
3284  * continue to be honored.
3285  */
3286  u64 num_devices = root->fs_info->fs_devices->rw_devices +
3287  root->fs_info->fs_devices->missing_devices;
3288  u64 target;
3289 
3290  /*
3291  * see if restripe for this chunk_type is in progress, if so
3292  * try to reduce to the target profile
3293  */
3294  spin_lock(&root->fs_info->balance_lock);
3295  target = get_restripe_target(root->fs_info, flags);
3296  if (target) {
3297  /* pick target profile only if it's already available */
3298  if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3299  spin_unlock(&root->fs_info->balance_lock);
3300  return extended_to_chunk(target);
3301  }
3302  }
3303  spin_unlock(&root->fs_info->balance_lock);
3304 
3305  if (num_devices == 1)
3306  flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP);
3307  if (num_devices < 4)
3308  flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3309 
3310  if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3311  (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3312  BTRFS_BLOCK_GROUP_RAID10))) {
3313  flags &= ~BTRFS_BLOCK_GROUP_DUP;
3314  }
3315 
3316  if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3317  (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3318  flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3319  }
3320 
3321  if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3322  ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3323  (flags & BTRFS_BLOCK_GROUP_RAID10) |
3324  (flags & BTRFS_BLOCK_GROUP_DUP))) {
3325  flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3326  }
3327 
3328  return extended_to_chunk(flags);
3329 }
3330 
3331 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3332 {
3333  if (flags & BTRFS_BLOCK_GROUP_DATA)
3334  flags |= root->fs_info->avail_data_alloc_bits;
3335  else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3336  flags |= root->fs_info->avail_system_alloc_bits;
3337  else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3338  flags |= root->fs_info->avail_metadata_alloc_bits;
3339 
3340  return btrfs_reduce_alloc_profile(root, flags);
3341 }
3342 
3343 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3344 {
3345  u64 flags;
3346 
3347  if (data)
3348  flags = BTRFS_BLOCK_GROUP_DATA;
3349  else if (root == root->fs_info->chunk_root)
3350  flags = BTRFS_BLOCK_GROUP_SYSTEM;
3351  else
3352  flags = BTRFS_BLOCK_GROUP_METADATA;
3353 
3354  return get_alloc_profile(root, flags);
3355 }
3356 
3357 /*
3358  * This will check the space that the inode allocates from to make sure we have
3359  * enough space for bytes.
3360  */
3361 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3362 {
3363  struct btrfs_space_info *data_sinfo;
3364  struct btrfs_root *root = BTRFS_I(inode)->root;
3365  struct btrfs_fs_info *fs_info = root->fs_info;
3366  u64 used;
3367  int ret = 0, committed = 0, alloc_chunk = 1;
3368 
3369  /* make sure bytes are sectorsize aligned */
3370  bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3371 
3372  if (root == root->fs_info->tree_root ||
3373  BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3374  alloc_chunk = 0;
3375  committed = 1;
3376  }
3377 
3378  data_sinfo = fs_info->data_sinfo;
3379  if (!data_sinfo)
3380  goto alloc;
3381 
3382 again:
3383  /* make sure we have enough space to handle the data first */
3384  spin_lock(&data_sinfo->lock);
3385  used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3386  data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3387  data_sinfo->bytes_may_use;
3388 
3389  if (used + bytes > data_sinfo->total_bytes) {
3390  struct btrfs_trans_handle *trans;
3391 
3392  /*
3393  * if we don't have enough free bytes in this space then we need
3394  * to alloc a new chunk.
3395  */
3396  if (!data_sinfo->full && alloc_chunk) {
3397  u64 alloc_target;
3398 
3399  data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3400  spin_unlock(&data_sinfo->lock);
3401 alloc:
3402  alloc_target = btrfs_get_alloc_profile(root, 1);
3403  trans = btrfs_join_transaction(root);
3404  if (IS_ERR(trans))
3405  return PTR_ERR(trans);
3406 
3407  ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3408  alloc_target,
3409  CHUNK_ALLOC_NO_FORCE);
3410  btrfs_end_transaction(trans, root);
3411  if (ret < 0) {
3412  if (ret != -ENOSPC)
3413  return ret;
3414  else
3415  goto commit_trans;
3416  }
3417 
3418  if (!data_sinfo)
3419  data_sinfo = fs_info->data_sinfo;
3420 
3421  goto again;
3422  }
3423 
3424  /*
3425  * If we have less pinned bytes than we want to allocate then
3426  * don't bother committing the transaction, it won't help us.
3427  */
3428  if (data_sinfo->bytes_pinned < bytes)
3429  committed = 1;
3430  spin_unlock(&data_sinfo->lock);
3431 
3432  /* commit the current transaction and try again */
3433 commit_trans:
3434  if (!committed &&
3435  !atomic_read(&root->fs_info->open_ioctl_trans)) {
3436  committed = 1;
3437  trans = btrfs_join_transaction(root);
3438  if (IS_ERR(trans))
3439  return PTR_ERR(trans);
3440  ret = btrfs_commit_transaction(trans, root);
3441  if (ret)
3442  return ret;
3443  goto again;
3444  }
3445 
3446  return -ENOSPC;
3447  }
3448  data_sinfo->bytes_may_use += bytes;
3449  trace_btrfs_space_reservation(root->fs_info, "space_info",
3450  data_sinfo->flags, bytes, 1);
3451  spin_unlock(&data_sinfo->lock);
3452 
3453  return 0;
3454 }
3455 
3456 /*
3457  * Called if we need to clear a data reservation for this inode.
3458  */
3459 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3460 {
3461  struct btrfs_root *root = BTRFS_I(inode)->root;
3462  struct btrfs_space_info *data_sinfo;
3463 
3464  /* make sure bytes are sectorsize aligned */
3465  bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3466 
3467  data_sinfo = root->fs_info->data_sinfo;
3468  spin_lock(&data_sinfo->lock);
3469  data_sinfo->bytes_may_use -= bytes;
3470  trace_btrfs_space_reservation(root->fs_info, "space_info",
3471  data_sinfo->flags, bytes, 0);
3472  spin_unlock(&data_sinfo->lock);
3473 }
3474 
3475 static void force_metadata_allocation(struct btrfs_fs_info *info)
3476 {
3477  struct list_head *head = &info->space_info;
3478  struct btrfs_space_info *found;
3479 
3480  rcu_read_lock();
3481  list_for_each_entry_rcu(found, head, list) {
3482  if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3483  found->force_alloc = CHUNK_ALLOC_FORCE;
3484  }
3485  rcu_read_unlock();
3486 }
3487 
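/*
 * Heuristic for whether a new chunk is worth allocating: always for
 * CHUNK_ALLOC_FORCE, when free space drops near 1% of the FS for
 * CHUNK_ALLOC_LIMITED, and otherwise only once this space_info is mostly
 * used.
 */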
3488 static int should_alloc_chunk(struct btrfs_root *root,
3489  struct btrfs_space_info *sinfo, int force)
3490 {
3491  struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3492  u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3493  u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3494  u64 thresh;
3495 
3496  if (force == CHUNK_ALLOC_FORCE)
3497  return 1;
3498 
3499  /*
3500  * We need to take into account the global rsv because for all intents
3501  * and purposes it's used space. Don't worry about locking the
3502  * global_rsv, it doesn't change except when the transaction commits.
3503  */
3504  if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505  num_allocated += global_rsv->size;
3506 
3507  /*
3508  * in limited mode, we want to have some free space up to
3509  * about 1% of the FS size.
3510  */
3511  if (force == CHUNK_ALLOC_LIMITED) {
3512  thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3513  thresh = max_t(u64, 64 * 1024 * 1024,
3514  div_factor_fine(thresh, 1));
3515 
3516  if (num_bytes - num_allocated < thresh)
3517  return 1;
3518  }
3519 
3520  if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3521  return 0;
3522  return 1;
3523 }
3524 
3525 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3526 {
3527  u64 num_dev;
3528 
3529  if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3530  type & BTRFS_BLOCK_GROUP_RAID0)
3531  num_dev = root->fs_info->fs_devices->rw_devices;
3532  else if (type & BTRFS_BLOCK_GROUP_RAID1)
3533  num_dev = 2;
3534  else
3535  num_dev = 1; /* DUP or single */
3536 
3537  /* metadata for updating devices and the chunk tree */
3538  return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3539 }
3540 
3541 static void check_system_chunk(struct btrfs_trans_handle *trans,
3542  struct btrfs_root *root, u64 type)
3543 {
3544  struct btrfs_space_info *info;
3545  u64 left;
3546  u64 thresh;
3547 
3548  info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3549  spin_lock(&info->lock);
3550  left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3551  info->bytes_reserved - info->bytes_readonly;
3552  spin_unlock(&info->lock);
3553 
3554  thresh = get_system_chunk_thresh(root, type);
3555  if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3556  printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3557  left, thresh, type);
3558  dump_space_info(info, 0, 0);
3559  }
3560 
3561  if (left < thresh) {
3562  u64 flags;
3563 
3564  flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3565  btrfs_alloc_chunk(trans, root, flags);
3566  }
3567 }
3568 
3569 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3570  struct btrfs_root *extent_root, u64 flags, int force)
3571 {
3572  struct btrfs_space_info *space_info;
3573  struct btrfs_fs_info *fs_info = extent_root->fs_info;
3574  int wait_for_alloc = 0;
3575  int ret = 0;
3576 
3577  space_info = __find_space_info(extent_root->fs_info, flags);
3578  if (!space_info) {
3579  ret = update_space_info(extent_root->fs_info, flags,
3580  0, 0, &space_info);
3581  BUG_ON(ret); /* -ENOMEM */
3582  }
3583  BUG_ON(!space_info); /* Logic error */
3584 
3585 again:
3586  spin_lock(&space_info->lock);
3587  if (force < space_info->force_alloc)
3588  force = space_info->force_alloc;
3589  if (space_info->full) {
3590  spin_unlock(&space_info->lock);
3591  return 0;
3592  }
3593 
3594  if (!should_alloc_chunk(extent_root, space_info, force)) {
3595  spin_unlock(&space_info->lock);
3596  return 0;
3597  } else if (space_info->chunk_alloc) {
3598  wait_for_alloc = 1;
3599  } else {
3600  space_info->chunk_alloc = 1;
3601  }
3602 
3603  spin_unlock(&space_info->lock);
3604 
3605  mutex_lock(&fs_info->chunk_mutex);
3606 
3607  /*
3608  * The chunk_mutex is held throughout the entirety of a chunk
3609  * allocation, so once we've acquired the chunk_mutex we know that the
3610  * other guy is done and we need to recheck and see if we should
3611  * allocate.
3612  */
3613  if (wait_for_alloc) {
3614  mutex_unlock(&fs_info->chunk_mutex);
3615  wait_for_alloc = 0;
3616  goto again;
3617  }
3618 
3619  /*
3620  * If we have mixed data/metadata chunks we want to make sure we keep
3621  * allocating mixed chunks instead of individual chunks.
3622  */
3623  if (btrfs_mixed_space_info(space_info))
3624  flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3625 
3626  /*
3627  * if we're doing a data chunk, go ahead and make sure that
3628  * we keep a reasonable number of metadata chunks allocated in the
3629  * FS as well.
3630  */
3631  if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3632  fs_info->data_chunk_allocations++;
3633  if (!(fs_info->data_chunk_allocations %
3634  fs_info->metadata_ratio))
3635  force_metadata_allocation(fs_info);
3636  }
3637 
3638  /*
3639  * Check if we have enough space in SYSTEM chunk because we may need
3640  * to update devices.
3641  */
3642  check_system_chunk(trans, extent_root, flags);
3643 
3644  ret = btrfs_alloc_chunk(trans, extent_root, flags);
3645  if (ret < 0 && ret != -ENOSPC)
3646  goto out;
3647 
3648  spin_lock(&space_info->lock);
3649  if (ret)
3650  space_info->full = 1;
3651  else
3652  ret = 1;
3653 
3654  space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3655  space_info->chunk_alloc = 0;
3656  spin_unlock(&space_info->lock);
3657 out:
3658  mutex_unlock(&fs_info->chunk_mutex);
3659  return ret;
3660 }
3661 
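/*
 * Decide whether a reservation of @bytes may overcommit the space_info:
 * compare current usage plus the request against total_bytes plus a fraction
 * of the remaining unallocated chunk space (1/2 if we can flush, 1/8 if
 * not), halved again for DUP/RAID1/RAID10 profiles.
 */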
3662 static int can_overcommit(struct btrfs_root *root,
3663  struct btrfs_space_info *space_info, u64 bytes,
3664  int flush)
3665 {
3666  u64 profile = btrfs_get_alloc_profile(root, 0);
3667  u64 avail;
3668  u64 used;
3669 
3670  used = space_info->bytes_used + space_info->bytes_reserved +
3671  space_info->bytes_pinned + space_info->bytes_readonly +
3672  space_info->bytes_may_use;
3673 
3674  spin_lock(&root->fs_info->free_chunk_lock);
3675  avail = root->fs_info->free_chunk_space;
3676  spin_unlock(&root->fs_info->free_chunk_lock);
3677 
3678  /*
3679  * If we have dup, raid1 or raid10 then only half of the free
3680  * space is actually usable.
3681  */
3682  if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683  BTRFS_BLOCK_GROUP_RAID1 |
3684  BTRFS_BLOCK_GROUP_RAID10))
3685  avail >>= 1;
3686 
3687  /*
3688  * If we aren't flushing don't let us overcommit too much, say
3689  * 1/8th of the space. If we can flush, let it overcommit up to
3690  * 1/2 of the space.
3691  */
3692  if (flush)
3693  avail >>= 3;
3694  else
3695  avail >>= 1;
3696 
3697  if (used + bytes < space_info->total_bytes + avail)
3698  return 1;
3699  return 0;
3700 }
3701 
3702 /*
3703  * shrink metadata reservation for delalloc
3704  */
3705 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3706  bool wait_ordered)
3707 {
3708  struct btrfs_block_rsv *block_rsv;
3709  struct btrfs_space_info *space_info;
3710  struct btrfs_trans_handle *trans;
3711  u64 delalloc_bytes;
3712  u64 max_reclaim;
3713  long time_left;
3714  unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715  int loops = 0;
3716 
3717  trans = (struct btrfs_trans_handle *)current->journal_info;
3718  block_rsv = &root->fs_info->delalloc_block_rsv;
3719  space_info = block_rsv->space_info;
3720 
3721  smp_mb();
3722  delalloc_bytes = root->fs_info->delalloc_bytes;
3723  if (delalloc_bytes == 0) {
3724  if (trans)
3725  return;
3726  btrfs_wait_ordered_extents(root, 0);
3727  return;
3728  }
3729 
3730  while (delalloc_bytes && loops < 3) {
3731  max_reclaim = min(delalloc_bytes, to_reclaim);
3732  nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733  writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3734  WB_REASON_FS_FREE_SPACE);
3735 
3736  /*
3737  * We need to wait for the async pages to actually start before
3738  * we do anything.
3739  */
3740  wait_event(root->fs_info->async_submit_wait,
3741  !atomic_read(&root->fs_info->async_delalloc_pages));
3742 
3743  spin_lock(&space_info->lock);
3744  if (can_overcommit(root, space_info, orig, !trans)) {
3745  spin_unlock(&space_info->lock);
3746  break;
3747  }
3748  spin_unlock(&space_info->lock);
3749 
3750  loops++;
3751  if (wait_ordered && !trans) {
3752  btrfs_wait_ordered_extents(root, 0);
3753  } else {
3754  time_left = schedule_timeout_killable(1);
3755  if (time_left)
3756  break;
3757  }
3758  smp_mb();
3759  delalloc_bytes = root->fs_info->delalloc_bytes;
3760  }
3761 }
3762 
3773 static int may_commit_transaction(struct btrfs_root *root,
3774  struct btrfs_space_info *space_info,
3775  u64 bytes, int force)
3776 {
3777  struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3778  struct btrfs_trans_handle *trans;
3779 
3780  trans = (struct btrfs_trans_handle *)current->journal_info;
3781  if (trans)
3782  return -EAGAIN;
3783 
3784  if (force)
3785  goto commit;
3786 
3787  /* See if there is enough pinned space to make this reservation */
3788  spin_lock(&space_info->lock);
3789  if (space_info->bytes_pinned >= bytes) {
3790  spin_unlock(&space_info->lock);
3791  goto commit;
3792  }
3793  spin_unlock(&space_info->lock);
3794 
3795  /*
3796  * See if there is some space in the delayed insertion reservation for
3797  * this reservation.
3798  */
3799  if (space_info != delayed_rsv->space_info)
3800  return -ENOSPC;
3801 
3802  spin_lock(&space_info->lock);
3803  spin_lock(&delayed_rsv->lock);
3804  if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3805  spin_unlock(&delayed_rsv->lock);
3806  spin_unlock(&space_info->lock);
3807  return -ENOSPC;
3808  }
3809  spin_unlock(&delayed_rsv->lock);
3810  spin_unlock(&space_info->lock);
3811 
3812 commit:
3813  trans = btrfs_join_transaction(root);
3814  if (IS_ERR(trans))
3815  return -ENOSPC;
3816 
3817  return btrfs_commit_transaction(trans, root);
3818 }
3819 
3820 enum flush_state {
3821  FLUSH_DELAYED_ITEMS_NR = 1,
3822  FLUSH_DELAYED_ITEMS = 2,
3823  FLUSH_DELALLOC = 3,
3824  FLUSH_DELALLOC_WAIT = 4,
3825  ALLOC_CHUNK = 5,
3826  COMMIT_TRANS = 6,
3827 };
3828 
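/*
 * Run one space-reclaim step, chosen by state: flush delayed items, flush
 * delalloc, allocate a new chunk, or commit the transaction as a last
 * resort.
 */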
3829 static int flush_space(struct btrfs_root *root,
3830  struct btrfs_space_info *space_info, u64 num_bytes,
3831  u64 orig_bytes, int state)
3832 {
3833  struct btrfs_trans_handle *trans;
3834  int nr;
3835  int ret = 0;
3836 
3837  switch (state) {
3838  case FLUSH_DELAYED_ITEMS_NR:
3839  case FLUSH_DELAYED_ITEMS:
3840  if (state == FLUSH_DELAYED_ITEMS_NR) {
3841  u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3842 
3843  nr = (int)div64_u64(num_bytes, bytes);
3844  if (!nr)
3845  nr = 1;
3846  nr *= 2;
3847  } else {
3848  nr = -1;
3849  }
3850  trans = btrfs_join_transaction(root);
3851  if (IS_ERR(trans)) {
3852  ret = PTR_ERR(trans);
3853  break;
3854  }
3855  ret = btrfs_run_delayed_items_nr(trans, root, nr);
3856  btrfs_end_transaction(trans, root);
3857  break;
3858  case FLUSH_DELALLOC:
3859  case FLUSH_DELALLOC_WAIT:
3860  shrink_delalloc(root, num_bytes, orig_bytes,
3861  state == FLUSH_DELALLOC_WAIT);
3862  break;
3863  case ALLOC_CHUNK:
3864  trans = btrfs_join_transaction(root);
3865  if (IS_ERR(trans)) {
3866  ret = PTR_ERR(trans);
3867  break;
3868  }
3869  ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870  btrfs_get_alloc_profile(root, 0),
3872  btrfs_end_transaction(trans, root);
3873  if (ret == -ENOSPC)
3874  ret = 0;
3875  break;
3876  case COMMIT_TRANS:
3877  ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3878  break;
3879  default:
3880  ret = -ENOSPC;
3881  break;
3882  }
3883 
3884  return ret;
3885 }
3900 static int reserve_metadata_bytes(struct btrfs_root *root,
3901  struct btrfs_block_rsv *block_rsv,
3902  u64 orig_bytes, int flush)
3903 {
3904  struct btrfs_space_info *space_info = block_rsv->space_info;
3905  u64 used;
3906  u64 num_bytes = orig_bytes;
3907  int flush_state = FLUSH_DELAYED_ITEMS_NR;
3908  int ret = 0;
3909  bool flushing = false;
3910 
3911 again:
3912  ret = 0;
3913  spin_lock(&space_info->lock);
3914  /*
3915  * We only want to wait if somebody other than us is flushing and we are
3916  * actually allowed to flush.
3917  */
3918  while (flush && !flushing && space_info->flush) {
3919  spin_unlock(&space_info->lock);
3920  /*
3921  * If we have a trans handle we can't wait because the flusher
3922  * may have to commit the transaction, which would mean we would
3923  * deadlock since we are waiting for the flusher to finish, but
3924  * hold the current transaction open.
3925  */
3926  if (current->journal_info)
3927  return -EAGAIN;
3928  ret = wait_event_killable(space_info->wait, !space_info->flush);
3929  /* Must have been killed, return */
3930  if (ret)
3931  return -EINTR;
3932 
3933  spin_lock(&space_info->lock);
3934  }
3935 
3936  ret = -ENOSPC;
3937  used = space_info->bytes_used + space_info->bytes_reserved +
3938  space_info->bytes_pinned + space_info->bytes_readonly +
3939  space_info->bytes_may_use;
3940 
3941  /*
3942  * The idea here is that if we've not already over-reserved the block group
3943  * then we can go ahead and save our reservation first and then start
3944  * flushing if we need to. Otherwise, if we've already overcommitted,
3945  * let's start flushing stuff first and then come back and try to make
3946  * our reservation.
3947  */
3948  if (used <= space_info->total_bytes) {
3949  if (used + orig_bytes <= space_info->total_bytes) {
3950  space_info->bytes_may_use += orig_bytes;
3951  trace_btrfs_space_reservation(root->fs_info,
3952  "space_info", space_info->flags, orig_bytes, 1);
3953  ret = 0;
3954  } else {
3955  /*
3956  * Ok, set num_bytes to orig_bytes since we aren't
3957  * overcommitted; this way we only try to reclaim what
3958  * we need.
3959  */
3960  num_bytes = orig_bytes;
3961  }
3962  } else {
3963  /*
3964  * Ok, we're overcommitted; set num_bytes to the overcommitted
3965  * amount plus the amount of bytes that we need for this
3966  * reservation.
3967  */
3968  num_bytes = used - space_info->total_bytes +
3969  (orig_bytes * 2);
3970  }
3971 
3972  if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3973  space_info->bytes_may_use += orig_bytes;
3974  trace_btrfs_space_reservation(root->fs_info, "space_info",
3975  space_info->flags, orig_bytes,
3976  1);
3977  ret = 0;
3978  }
3979 
3980  /*
3981  * Couldn't make our reservation, save our place so while we're trying
3982  * to reclaim space we can actually use it instead of somebody else
3983  * stealing it from us.
3984  */
3985  if (ret && flush) {
3986  flushing = true;
3987  space_info->flush = 1;
3988  }
3989 
3990  spin_unlock(&space_info->lock);
3991 
3992  if (!ret || !flush)
3993  goto out;
3994 
3995  ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996  flush_state);
3997  flush_state++;
3998  if (!ret)
3999  goto again;
4000  else if (flush_state <= COMMIT_TRANS)
4001  goto again;
4002 
4003 out:
4004  if (flushing) {
4005  spin_lock(&space_info->lock);
4006  space_info->flush = 0;
4007  wake_up_all(&space_info->wait);
4008  spin_unlock(&space_info->lock);
4009  }
4010  return ret;
4011 }
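/*
 * A worked example of the overcommit math above, with hypothetical
 * numbers: if total_bytes is 100M, used adds up to 110M and a caller
 * asks for orig_bytes = 4M, the space_info is already overcommitted, so
 * num_bytes becomes 110M - 100M + 2 * 4M = 18M and that is the amount
 * the flush states try to reclaim before the reservation is retried.
 */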
4012 
4013 static struct btrfs_block_rsv *get_block_rsv(
4014  const struct btrfs_trans_handle *trans,
4015  const struct btrfs_root *root)
4016 {
4017  struct btrfs_block_rsv *block_rsv = NULL;
4018 
4019  if (root->ref_cows)
4020  block_rsv = trans->block_rsv;
4021 
4022  if (root == root->fs_info->csum_root && trans->adding_csums)
4023  block_rsv = trans->block_rsv;
4024 
4025  if (!block_rsv)
4026  block_rsv = root->block_rsv;
4027 
4028  if (!block_rsv)
4029  block_rsv = &root->fs_info->empty_block_rsv;
4030 
4031  return block_rsv;
4032 }
4033 
4034 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4035  u64 num_bytes)
4036 {
4037  int ret = -ENOSPC;
4038  spin_lock(&block_rsv->lock);
4039  if (block_rsv->reserved >= num_bytes) {
4040  block_rsv->reserved -= num_bytes;
4041  if (block_rsv->reserved < block_rsv->size)
4042  block_rsv->full = 0;
4043  ret = 0;
4044  }
4045  spin_unlock(&block_rsv->lock);
4046  return ret;
4047 }
4048 
4049 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4050  u64 num_bytes, int update_size)
4051 {
4052  spin_lock(&block_rsv->lock);
4053  block_rsv->reserved += num_bytes;
4054  if (update_size)
4055  block_rsv->size += num_bytes;
4056  else if (block_rsv->reserved >= block_rsv->size)
4057  block_rsv->full = 1;
4058  spin_unlock(&block_rsv->lock);
4059 }
4060 
4061 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4062  struct btrfs_block_rsv *block_rsv,
4063  struct btrfs_block_rsv *dest, u64 num_bytes)
4064 {
4065  struct btrfs_space_info *space_info = block_rsv->space_info;
4066 
4067  spin_lock(&block_rsv->lock);
4068  if (num_bytes == (u64)-1)
4069  num_bytes = block_rsv->size;
4070  block_rsv->size -= num_bytes;
4071  if (block_rsv->reserved >= block_rsv->size) {
4072  num_bytes = block_rsv->reserved - block_rsv->size;
4073  block_rsv->reserved = block_rsv->size;
4074  block_rsv->full = 1;
4075  } else {
4076  num_bytes = 0;
4077  }
4078  spin_unlock(&block_rsv->lock);
4079 
4080  if (num_bytes > 0) {
4081  if (dest) {
4082  spin_lock(&dest->lock);
4083  if (!dest->full) {
4084  u64 bytes_to_add;
4085 
4086  bytes_to_add = dest->size - dest->reserved;
4087  bytes_to_add = min(num_bytes, bytes_to_add);
4088  dest->reserved += bytes_to_add;
4089  if (dest->reserved >= dest->size)
4090  dest->full = 1;
4091  num_bytes -= bytes_to_add;
4092  }
4093  spin_unlock(&dest->lock);
4094  }
4095  if (num_bytes) {
4096  spin_lock(&space_info->lock);
4097  space_info->bytes_may_use -= num_bytes;
4098  trace_btrfs_space_reservation(fs_info, "space_info",
4099  space_info->flags, num_bytes, 0);
4100  space_info->reservation_progress++;
4101  spin_unlock(&space_info->lock);
4102  }
4103  }
4104 }
4105 
4106 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4107  struct btrfs_block_rsv *dst, u64 num_bytes)
4108 {
4109  int ret;
4110 
4111  ret = block_rsv_use_bytes(src, num_bytes);
4112  if (ret)
4113  return ret;
4114 
4115  block_rsv_add_bytes(dst, num_bytes, 1);
4116  return 0;
4117 }
4118 
4119 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4120 {
4121  memset(rsv, 0, sizeof(*rsv));
4122  spin_lock_init(&rsv->lock);
4123  rsv->type = type;
4124 }
4125 
4126 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127  unsigned short type)
4128 {
4129  struct btrfs_block_rsv *block_rsv;
4130  struct btrfs_fs_info *fs_info = root->fs_info;
4131 
4132  block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4133  if (!block_rsv)
4134  return NULL;
4135 
4136  btrfs_init_block_rsv(block_rsv, type);
4137  block_rsv->space_info = __find_space_info(fs_info,
4138  BTRFS_BLOCK_GROUP_METADATA);
4139  return block_rsv;
4140 }
4141 
4142 void btrfs_free_block_rsv(struct btrfs_root *root,
4143  struct btrfs_block_rsv *rsv)
4144 {
4145  if (!rsv)
4146  return;
4147  btrfs_block_rsv_release(root, rsv, (u64)-1);
4148  kfree(rsv);
4149 }
4150 
4151 static inline int __block_rsv_add(struct btrfs_root *root,
4152  struct btrfs_block_rsv *block_rsv,
4153  u64 num_bytes, int flush)
4154 {
4155  int ret;
4156 
4157  if (num_bytes == 0)
4158  return 0;
4159 
4160  ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4161  if (!ret) {
4162  block_rsv_add_bytes(block_rsv, num_bytes, 1);
4163  return 0;
4164  }
4165 
4166  return ret;
4167 }
4168 
4169 int btrfs_block_rsv_add(struct btrfs_root *root,
4170  struct btrfs_block_rsv *block_rsv,
4171  u64 num_bytes)
4172 {
4173  return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174 }
4175 
4176 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177  struct btrfs_block_rsv *block_rsv,
4178  u64 num_bytes)
4179 {
4180  return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181 }
4182 
4183 int btrfs_block_rsv_check(struct btrfs_root *root,
4184  struct btrfs_block_rsv *block_rsv, int min_factor)
4185 {
4186  u64 num_bytes = 0;
4187  int ret = -ENOSPC;
4188 
4189  if (!block_rsv)
4190  return 0;
4191 
4192  spin_lock(&block_rsv->lock);
4193  num_bytes = div_factor(block_rsv->size, min_factor);
4194  if (block_rsv->reserved >= num_bytes)
4195  ret = 0;
4196  spin_unlock(&block_rsv->lock);
4197 
4198  return ret;
4199 }
4200 
4201 static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4202  struct btrfs_block_rsv *block_rsv,
4203  u64 min_reserved, int flush)
4204 {
4205  u64 num_bytes = 0;
4206  int ret = -ENOSPC;
4207 
4208  if (!block_rsv)
4209  return 0;
4210 
4211  spin_lock(&block_rsv->lock);
4212  num_bytes = min_reserved;
4213  if (block_rsv->reserved >= num_bytes)
4214  ret = 0;
4215  else
4216  num_bytes -= block_rsv->reserved;
4217  spin_unlock(&block_rsv->lock);
4218 
4219  if (!ret)
4220  return 0;
4221 
4222  ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4223  if (!ret) {
4224  block_rsv_add_bytes(block_rsv, num_bytes, 0);
4225  return 0;
4226  }
4227 
4228  return ret;
4229 }
4230 
4231 int btrfs_block_rsv_refill(struct btrfs_root *root,
4232  struct btrfs_block_rsv *block_rsv,
4233  u64 min_reserved)
4234 {
4235  return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236 }
4237 
4238 int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239  struct btrfs_block_rsv *block_rsv,
4240  u64 min_reserved)
4241 {
4242  return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243 }
4244 
4245 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246  struct btrfs_block_rsv *dst_rsv,
4247  u64 num_bytes)
4248 {
4249  return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4250 }
4251 
4252 void btrfs_block_rsv_release(struct btrfs_root *root,
4253  struct btrfs_block_rsv *block_rsv,
4254  u64 num_bytes)
4255 {
4256  struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4257  if (global_rsv->full || global_rsv == block_rsv ||
4258  block_rsv->space_info != global_rsv->space_info)
4259  global_rsv = NULL;
4260  block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4261  num_bytes);
4262 }
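/*
 * Illustrative lifecycle of a private block reservation using the
 * helpers above (a sketch, not a verbatim caller from this file):
 *
 *   rsv = btrfs_alloc_block_rsv(root, type);
 *   ret = btrfs_block_rsv_add(root, rsv, num_bytes);   reserve up front
 *   ...
 *   block_rsv_use_bytes(rsv, bytes);                   consume as we go
 *   ...
 *   btrfs_free_block_rsv(root, rsv);                   release the rest
 *
 * When bytes are released, block_rsv_release_bytes() first tops up the
 * global reserve (if it shares the space_info) and only then gives the
 * remainder back by reducing bytes_may_use.
 */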
4263 
4264 /*
4265  * helper to calculate size of global block reservation.
4266  * the desired value is sum of space used by extent tree,
4267  * checksum tree and root tree
4268  */
4269 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4270 {
4271  struct btrfs_space_info *sinfo;
4272  u64 num_bytes;
4273  u64 meta_used;
4274  u64 data_used;
4275  int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4276 
4277  sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4278  spin_lock(&sinfo->lock);
4279  data_used = sinfo->bytes_used;
4280  spin_unlock(&sinfo->lock);
4281 
4282  sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4283  spin_lock(&sinfo->lock);
4284  if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4285  data_used = 0;
4286  meta_used = sinfo->bytes_used;
4287  spin_unlock(&sinfo->lock);
4288 
4289  num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4290  csum_size * 2;
4291  num_bytes += div64_u64(data_used + meta_used, 50);
4292 
4293  if (num_bytes * 3 > meta_used)
4294  num_bytes = div64_u64(meta_used, 3);
4295 
4296  return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4297 }
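/*
 * Example with hypothetical numbers: data_used = 10G, meta_used = 1G,
 * a 4K block size and a 4 byte csum give (10G >> 12) * 4 * 2 = 20M for
 * the csum term plus (10G + 1G) / 50 = ~225M, so num_bytes is ~245M;
 * since 3 * 245M does not exceed meta_used, the value is not clamped
 * and is simply rounded up to a multiple of leafsize << 10.
 */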
4298 
4299 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4300 {
4301  struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4302  struct btrfs_space_info *sinfo = block_rsv->space_info;
4303  u64 num_bytes;
4304 
4305  num_bytes = calc_global_metadata_size(fs_info);
4306 
4307  spin_lock(&sinfo->lock);
4308  spin_lock(&block_rsv->lock);
4309 
4310  block_rsv->size = num_bytes;
4311 
4312  num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4313  sinfo->bytes_reserved + sinfo->bytes_readonly +
4314  sinfo->bytes_may_use;
4315 
4316  if (sinfo->total_bytes > num_bytes) {
4317  num_bytes = sinfo->total_bytes - num_bytes;
4318  block_rsv->reserved += num_bytes;
4319  sinfo->bytes_may_use += num_bytes;
4320  trace_btrfs_space_reservation(fs_info, "space_info",
4321  sinfo->flags, num_bytes, 1);
4322  }
4323 
4324  if (block_rsv->reserved >= block_rsv->size) {
4325  num_bytes = block_rsv->reserved - block_rsv->size;
4326  sinfo->bytes_may_use -= num_bytes;
4327  trace_btrfs_space_reservation(fs_info, "space_info",
4328  sinfo->flags, num_bytes, 0);
4329  sinfo->reservation_progress++;
4330  block_rsv->reserved = block_rsv->size;
4331  block_rsv->full = 1;
4332  }
4333 
4334  spin_unlock(&block_rsv->lock);
4335  spin_unlock(&sinfo->lock);
4336 }
4337 
4338 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4339 {
4340  struct btrfs_space_info *space_info;
4341 
4342  space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4343  fs_info->chunk_block_rsv.space_info = space_info;
4344 
4345  space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4346  fs_info->global_block_rsv.space_info = space_info;
4347  fs_info->delalloc_block_rsv.space_info = space_info;
4348  fs_info->trans_block_rsv.space_info = space_info;
4349  fs_info->empty_block_rsv.space_info = space_info;
4350  fs_info->delayed_block_rsv.space_info = space_info;
4351 
4352  fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4353  fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4354  fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4355  fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4356  fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4357 
4358  update_global_block_rsv(fs_info);
4359 }
4360 
4361 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4362 {
4363  block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4364  (u64)-1);
4365  WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4366  WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4367  WARN_ON(fs_info->trans_block_rsv.size > 0);
4368  WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4369  WARN_ON(fs_info->chunk_block_rsv.size > 0);
4370  WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4371  WARN_ON(fs_info->delayed_block_rsv.size > 0);
4372  WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4373 }
4374 
4375 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4376  struct btrfs_root *root)
4377 {
4378  if (!trans->block_rsv)
4379  return;
4380 
4381  if (!trans->bytes_reserved)
4382  return;
4383 
4384  trace_btrfs_space_reservation(root->fs_info, "transaction",
4385  trans->transid, trans->bytes_reserved, 0);
4386  btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4387  trans->bytes_reserved = 0;
4388 }
4389 
4390 /* Can only return 0 or -ENOSPC */
4391 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4392  struct inode *inode)
4393 {
4394  struct btrfs_root *root = BTRFS_I(inode)->root;
4395  struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4396  struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4397 
4398  /*
4399  * We need to hold space in order to delete our orphan item once we've
4400  * added it, so this takes the reservation so we can release it later
4401  * when we are truly done with the orphan item.
4402  */
4403  u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4404  trace_btrfs_space_reservation(root->fs_info, "orphan",
4405  btrfs_ino(inode), num_bytes, 1);
4406  return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4407 }
4408 
4409 void btrfs_orphan_release_metadata(struct inode *inode)
4410 {
4411  struct btrfs_root *root = BTRFS_I(inode)->root;
4412  u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4413  trace_btrfs_space_reservation(root->fs_info, "orphan",
4414  btrfs_ino(inode), num_bytes, 0);
4415  btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4416 }
4417 
4418 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4419  struct btrfs_pending_snapshot *pending)
4420 {
4421  struct btrfs_root *root = pending->root;
4422  struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4423  struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4424  /*
4425  * two for root back/forward refs, two for directory entries,
4426  * one for root of the snapshot and one for parent inode.
4427  */
4428  u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4429  dst_rsv->space_info = src_rsv->space_info;
4430  return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4431 }
4432 
4442 static unsigned drop_outstanding_extent(struct inode *inode)
4443 {
4444  unsigned drop_inode_space = 0;
4445  unsigned dropped_extents = 0;
4446 
4447  BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4448  BTRFS_I(inode)->outstanding_extents--;
4449 
4450  if (BTRFS_I(inode)->outstanding_extents == 0 &&
4451  test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4452  &BTRFS_I(inode)->runtime_flags))
4453  drop_inode_space = 1;
4454 
4455  /*
4456  * If we have the same number of outstanding extents as reserved extents,
4457  * or more, then we need to leave the reserved extents count alone.
4458  */
4459  if (BTRFS_I(inode)->outstanding_extents >=
4460  BTRFS_I(inode)->reserved_extents)
4461  return drop_inode_space;
4462 
4463  dropped_extents = BTRFS_I(inode)->reserved_extents -
4464  BTRFS_I(inode)->outstanding_extents;
4465  BTRFS_I(inode)->reserved_extents -= dropped_extents;
4466  return dropped_extents + drop_inode_space;
4467 }
4468 
4487 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4488  int reserve)
4489 {
4490  struct btrfs_root *root = BTRFS_I(inode)->root;
4491  u64 csum_size;
4492  int num_csums_per_leaf;
4493  int num_csums;
4494  int old_csums;
4495 
4496  if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4497  BTRFS_I(inode)->csum_bytes == 0)
4498  return 0;
4499 
4500  old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4501  if (reserve)
4502  BTRFS_I(inode)->csum_bytes += num_bytes;
4503  else
4504  BTRFS_I(inode)->csum_bytes -= num_bytes;
4505  csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4506  num_csums_per_leaf = (int)div64_u64(csum_size,
4507  sizeof(struct btrfs_csum_item) +
4508  sizeof(struct btrfs_disk_key));
4509  num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4510  num_csums = num_csums + num_csums_per_leaf - 1;
4511  num_csums = num_csums / num_csums_per_leaf;
4512 
4513  old_csums = old_csums + num_csums_per_leaf - 1;
4514  old_csums = old_csums / num_csums_per_leaf;
4515 
4516  /* No change, no need to reserve more */
4517  if (old_csums == num_csums)
4518  return 0;
4519 
4520  if (reserve)
4521  return btrfs_calc_trans_metadata_size(root,
4522  num_csums - old_csums);
4523 
4524  return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4525 }
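/*
 * num_csums_per_leaf above is a worst case estimate of how many
 * per-sector checksums fit in one leaf; the function only charges (or
 * refunds) metadata when the number of csum leaves actually changes, so
 * most small changes to csum_bytes cost nothing and the occasional
 * crossing of a leaf boundary costs one item's worth of
 * btrfs_calc_trans_metadata_size().
 */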
4526 
4527 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4528 {
4529  struct btrfs_root *root = BTRFS_I(inode)->root;
4530  struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4531  u64 to_reserve = 0;
4532  u64 csum_bytes;
4533  unsigned nr_extents = 0;
4534  int extra_reserve = 0;
4535  int flush = 1;
4536  int ret;
4537 
4538  /* Need to be holding the i_mutex here if we aren't free space cache */
4539  if (btrfs_is_free_space_inode(inode))
4540  flush = 0;
4541 
4542  if (flush && btrfs_transaction_in_commit(root->fs_info))
4543  schedule_timeout(1);
4544 
4545  mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4546  num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 
4548  spin_lock(&BTRFS_I(inode)->lock);
4549  BTRFS_I(inode)->outstanding_extents++;
4550 
4551  if (BTRFS_I(inode)->outstanding_extents >
4552  BTRFS_I(inode)->reserved_extents)
4553  nr_extents = BTRFS_I(inode)->outstanding_extents -
4554  BTRFS_I(inode)->reserved_extents;
4555 
4556  /*
4557  * Add an item to reserve for updating the inode when we complete the
4558  * delalloc io.
4559  */
4560  if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4561  &BTRFS_I(inode)->runtime_flags)) {
4562  nr_extents++;
4563  extra_reserve = 1;
4564  }
4565 
4566  to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4567  to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4568  csum_bytes = BTRFS_I(inode)->csum_bytes;
4569  spin_unlock(&BTRFS_I(inode)->lock);
4570 
4571  if (root->fs_info->quota_enabled) {
4572  ret = btrfs_qgroup_reserve(root, num_bytes +
4573  nr_extents * root->leafsize);
4574  if (ret) {
4575  mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576  return ret;
4577  }
4578  }
4579 
4580  ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4581  if (ret) {
4582  u64 to_free = 0;
4583  unsigned dropped;
4584 
4585  spin_lock(&BTRFS_I(inode)->lock);
4586  dropped = drop_outstanding_extent(inode);
4587  /*
4588  * If the inode's csum_bytes is the same as the original
4589  * csum_bytes then we know we haven't raced with any free()ers,
4590  * so we can just reduce our inode's csum bytes and carry on.
4591  * Otherwise we have to do the normal free thing to account for
4592  * the case that the free side didn't free up its reserve
4593  * because of this outstanding reservation.
4594  */
4595  if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4596  calc_csum_metadata_size(inode, num_bytes, 0);
4597  else
4598  to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4599  spin_unlock(&BTRFS_I(inode)->lock);
4600  if (dropped)
4601  to_free += btrfs_calc_trans_metadata_size(root, dropped);
4602 
4603  if (to_free) {
4604  btrfs_block_rsv_release(root, block_rsv, to_free);
4605  trace_btrfs_space_reservation(root->fs_info,
4606  "delalloc",
4607  btrfs_ino(inode),
4608  to_free, 0);
4609  }
4610  mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611  return ret;
4612  }
4613 
4614  spin_lock(&BTRFS_I(inode)->lock);
4615  if (extra_reserve) {
4616  clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4617  &BTRFS_I(inode)->runtime_flags);
4618  nr_extents--;
4619  }
4620  BTRFS_I(inode)->reserved_extents += nr_extents;
4621  spin_unlock(&BTRFS_I(inode)->lock);
4622  mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 
4624  if (to_reserve)
4625  trace_btrfs_space_reservation(root->fs_info, "delalloc",
4626  btrfs_ino(inode), to_reserve, 1);
4627  block_rsv_add_bytes(block_rsv, to_reserve, 1);
4628 
4629  return 0;
4630 }
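/*
 * The size of the reservation made above is computed under the inode's
 * spinlock from two inputs: the number of additional outstanding
 * extents (plus one item the first time delalloc metadata is reserved,
 * to cover the inode update) and the csum delta for num_bytes of data.
 * For example (hypothetical), a write that adds one outstanding extent
 * to a clean inode reserves btrfs_calc_trans_metadata_size(root, 2)
 * plus whatever calc_csum_metadata_size() returns for that range.
 */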
4631 
4641 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4642 {
4643  struct btrfs_root *root = BTRFS_I(inode)->root;
4644  u64 to_free = 0;
4645  unsigned dropped;
4646 
4647  num_bytes = ALIGN(num_bytes, root->sectorsize);
4648  spin_lock(&BTRFS_I(inode)->lock);
4649  dropped = drop_outstanding_extent(inode);
4650 
4651  to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4652  spin_unlock(&BTRFS_I(inode)->lock);
4653  if (dropped > 0)
4654  to_free += btrfs_calc_trans_metadata_size(root, dropped);
4655 
4656  trace_btrfs_space_reservation(root->fs_info, "delalloc",
4657  btrfs_ino(inode), to_free, 0);
4658  if (root->fs_info->quota_enabled) {
4659  btrfs_qgroup_free(root, num_bytes +
4660  dropped * root->leafsize);
4661  }
4662 
4663  btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4664  to_free);
4665 }
4666 
4682 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4683 {
4684  int ret;
4685 
4686  ret = btrfs_check_data_free_space(inode, num_bytes);
4687  if (ret)
4688  return ret;
4689 
4690  ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4691  if (ret) {
4692  btrfs_free_reserved_data_space(inode, num_bytes);
4693  return ret;
4694  }
4695 
4696  return 0;
4697 }
4698 
4712 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4713 {
4714  btrfs_delalloc_release_metadata(inode, num_bytes);
4715  btrfs_free_reserved_data_space(inode, num_bytes);
4716 }
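/*
 * btrfs_delalloc_reserve_space() and btrfs_delalloc_release_space() are
 * thin wrappers that keep the data half (btrfs_check_data_free_space /
 * btrfs_free_reserved_data_space) and the metadata half
 * (btrfs_delalloc_reserve_metadata / btrfs_delalloc_release_metadata)
 * paired, so callers and their error paths can take or drop both with a
 * single call.
 */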
4717 
4718 static int update_block_group(struct btrfs_trans_handle *trans,
4719  struct btrfs_root *root,
4720  u64 bytenr, u64 num_bytes, int alloc)
4721 {
4722  struct btrfs_block_group_cache *cache = NULL;
4723  struct btrfs_fs_info *info = root->fs_info;
4724  u64 total = num_bytes;
4725  u64 old_val;
4726  u64 byte_in_group;
4727  int factor;
4728 
4729  /* block accounting for super block */
4730  spin_lock(&info->delalloc_lock);
4731  old_val = btrfs_super_bytes_used(info->super_copy);
4732  if (alloc)
4733  old_val += num_bytes;
4734  else
4735  old_val -= num_bytes;
4736  btrfs_set_super_bytes_used(info->super_copy, old_val);
4737  spin_unlock(&info->delalloc_lock);
4738 
4739  while (total) {
4740  cache = btrfs_lookup_block_group(info, bytenr);
4741  if (!cache)
4742  return -ENOENT;
4743  if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4744  BTRFS_BLOCK_GROUP_RAID1 |
4745  BTRFS_BLOCK_GROUP_RAID10))
4746  factor = 2;
4747  else
4748  factor = 1;
4749  /*
4750  * If this block group has free space cache written out, we
4751  * need to make sure to load it if we are removing space. This
4752  * is because we need the unpinning stage to actually add the
4753  * space back to the block group, otherwise we will leak space.
4754  */
4755  if (!alloc && cache->cached == BTRFS_CACHE_NO)
4756  cache_block_group(cache, trans, NULL, 1);
4757 
4758  byte_in_group = bytenr - cache->key.objectid;
4759  WARN_ON(byte_in_group > cache->key.offset);
4760 
4761  spin_lock(&cache->space_info->lock);
4762  spin_lock(&cache->lock);
4763 
4764  if (btrfs_test_opt(root, SPACE_CACHE) &&
4765  cache->disk_cache_state < BTRFS_DC_CLEAR)
4766  cache->disk_cache_state = BTRFS_DC_CLEAR;
4767 
4768  cache->dirty = 1;
4769  old_val = btrfs_block_group_used(&cache->item);
4770  num_bytes = min(total, cache->key.offset - byte_in_group);
4771  if (alloc) {
4772  old_val += num_bytes;
4773  btrfs_set_block_group_used(&cache->item, old_val);
4774  cache->reserved -= num_bytes;
4775  cache->space_info->bytes_reserved -= num_bytes;
4776  cache->space_info->bytes_used += num_bytes;
4777  cache->space_info->disk_used += num_bytes * factor;
4778  spin_unlock(&cache->lock);
4779  spin_unlock(&cache->space_info->lock);
4780  } else {
4781  old_val -= num_bytes;
4782  btrfs_set_block_group_used(&cache->item, old_val);
4783  cache->pinned += num_bytes;
4784  cache->space_info->bytes_pinned += num_bytes;
4785  cache->space_info->bytes_used -= num_bytes;
4786  cache->space_info->disk_used -= num_bytes * factor;
4787  spin_unlock(&cache->lock);
4788  spin_unlock(&cache->space_info->lock);
4789 
4790  set_extent_dirty(info->pinned_extents,
4791  bytenr, bytenr + num_bytes - 1,
4792  GFP_NOFS | __GFP_NOFAIL);
4793  }
4794  btrfs_put_block_group(cache);
4795  total -= num_bytes;
4796  bytenr += num_bytes;
4797  }
4798  return 0;
4799 }
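/*
 * Note on factor above: for DUP/RAID1/RAID10 block groups every logical
 * byte occupies two bytes on disk, so bytes_used moves by num_bytes
 * while disk_used moves by num_bytes * factor.
 */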
4800 
4801 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4802 {
4803  struct btrfs_block_group_cache *cache;
4804  u64 bytenr;
4805 
4806  cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4807  if (!cache)
4808  return 0;
4809 
4810  bytenr = cache->key.objectid;
4811  btrfs_put_block_group(cache);
4812 
4813  return bytenr;
4814 }
4815 
4816 static int pin_down_extent(struct btrfs_root *root,
4817  struct btrfs_block_group_cache *cache,
4818  u64 bytenr, u64 num_bytes, int reserved)
4819 {
4820  spin_lock(&cache->space_info->lock);
4821  spin_lock(&cache->lock);
4822  cache->pinned += num_bytes;
4823  cache->space_info->bytes_pinned += num_bytes;
4824  if (reserved) {
4825  cache->reserved -= num_bytes;
4826  cache->space_info->bytes_reserved -= num_bytes;
4827  }
4828  spin_unlock(&cache->lock);
4829  spin_unlock(&cache->space_info->lock);
4830 
4831  set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4832  bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4833  return 0;
4834 }
4835 
4836 /*
4837  * this function must be called within transaction
4838  */
4839 int btrfs_pin_extent(struct btrfs_root *root,
4840  u64 bytenr, u64 num_bytes, int reserved)
4841 {
4842  struct btrfs_block_group_cache *cache;
4843 
4844  cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4845  BUG_ON(!cache); /* Logic error */
4846 
4847  pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4848 
4849  btrfs_put_block_group(cache);
4850  return 0;
4851 }
4852 
4853 /*
4854  * this function must be called within transaction
4855  */
4856 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4857  struct btrfs_root *root,
4858  u64 bytenr, u64 num_bytes)
4859 {
4860  struct btrfs_block_group_cache *cache;
4861 
4862  cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4863  BUG_ON(!cache); /* Logic error */
4864 
4865  /*
4866  * pull in the free space cache (if any) so that our pin
4867  * removes the free space from the cache. We have load_only set
4868  * to one because the slow code to read in the free extents does check
4869  * the pinned extents.
4870  */
4871  cache_block_group(cache, trans, root, 1);
4872 
4873  pin_down_extent(root, cache, bytenr, num_bytes, 0);
4874 
4875  /* remove us from the free space cache (if we're there at all) */
4876  btrfs_remove_free_space(cache, bytenr, num_bytes);
4877  btrfs_put_block_group(cache);
4878  return 0;
4879 }
4880 
4903 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4904  u64 num_bytes, int reserve)
4905 {
4906  struct btrfs_space_info *space_info = cache->space_info;
4907  int ret = 0;
4908 
4909  spin_lock(&space_info->lock);
4910  spin_lock(&cache->lock);
4911  if (reserve != RESERVE_FREE) {
4912  if (cache->ro) {
4913  ret = -EAGAIN;
4914  } else {
4915  cache->reserved += num_bytes;
4916  space_info->bytes_reserved += num_bytes;
4917  if (reserve == RESERVE_ALLOC) {
4918  trace_btrfs_space_reservation(cache->fs_info,
4919  "space_info", space_info->flags,
4920  num_bytes, 0);
4921  space_info->bytes_may_use -= num_bytes;
4922  }
4923  }
4924  } else {
4925  if (cache->ro)
4926  space_info->bytes_readonly += num_bytes;
4927  cache->reserved -= num_bytes;
4928  space_info->bytes_reserved -= num_bytes;
4929  space_info->reservation_progress++;
4930  }
4931  spin_unlock(&cache->lock);
4932  spin_unlock(&space_info->lock);
4933  return ret;
4934 }
4935 
4936 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4937  struct btrfs_root *root)
4938 {
4939  struct btrfs_fs_info *fs_info = root->fs_info;
4940  struct btrfs_caching_control *next;
4941  struct btrfs_caching_control *caching_ctl;
4942  struct btrfs_block_group_cache *cache;
4943 
4944  down_write(&fs_info->extent_commit_sem);
4945 
4946  list_for_each_entry_safe(caching_ctl, next,
4947  &fs_info->caching_block_groups, list) {
4948  cache = caching_ctl->block_group;
4949  if (block_group_cache_done(cache)) {
4950  cache->last_byte_to_unpin = (u64)-1;
4951  list_del_init(&caching_ctl->list);
4952  put_caching_control(caching_ctl);
4953  } else {
4954  cache->last_byte_to_unpin = caching_ctl->progress;
4955  }
4956  }
4957 
4958  if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4959  fs_info->pinned_extents = &fs_info->freed_extents[1];
4960  else
4961  fs_info->pinned_extents = &fs_info->freed_extents[0];
4962 
4963  up_write(&fs_info->extent_commit_sem);
4964 
4965  update_global_block_rsv(fs_info);
4966 }
4967 
4968 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969 {
4970  struct btrfs_fs_info *fs_info = root->fs_info;
4971  struct btrfs_block_group_cache *cache = NULL;
4972  u64 len;
4973 
4974  while (start <= end) {
4975  if (!cache ||
4976  start >= cache->key.objectid + cache->key.offset) {
4977  if (cache)
4978  btrfs_put_block_group(cache);
4979  cache = btrfs_lookup_block_group(fs_info, start);
4980  BUG_ON(!cache); /* Logic error */
4981  }
4982 
4983  len = cache->key.objectid + cache->key.offset - start;
4984  len = min(len, end + 1 - start);
4985 
4986  if (start < cache->last_byte_to_unpin) {
4987  len = min(len, cache->last_byte_to_unpin - start);
4988  btrfs_add_free_space(cache, start, len);
4989  }
4990 
4991  start += len;
4992 
4993  spin_lock(&cache->space_info->lock);
4994  spin_lock(&cache->lock);
4995  cache->pinned -= len;
4996  cache->space_info->bytes_pinned -= len;
4997  if (cache->ro)
4998  cache->space_info->bytes_readonly += len;
4999  spin_unlock(&cache->lock);
5000  spin_unlock(&cache->space_info->lock);
5001  }
5002 
5003  if (cache)
5004  btrfs_put_block_group(cache);
5005  return 0;
5006 }
5007 
5008 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5009  struct btrfs_root *root)
5010 {
5011  struct btrfs_fs_info *fs_info = root->fs_info;
5012  struct extent_io_tree *unpin;
5013  u64 start;
5014  u64 end;
5015  int ret;
5016 
5017  if (trans->aborted)
5018  return 0;
5019 
5020  if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5021  unpin = &fs_info->freed_extents[1];
5022  else
5023  unpin = &fs_info->freed_extents[0];
5024 
5025  while (1) {
5026  ret = find_first_extent_bit(unpin, 0, &start, &end,
5027  EXTENT_DIRTY, NULL);
5028  if (ret)
5029  break;
5030 
5031  if (btrfs_test_opt(root, DISCARD))
5032  ret = btrfs_discard_extent(root, start,
5033  end + 1 - start, NULL);
5034 
5035  clear_extent_dirty(unpin, start, end, GFP_NOFS);
5036  unpin_extent_range(root, start, end);
5037  cond_resched();
5038  }
5039 
5040  return 0;
5041 }
5042 
5043 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5044  struct btrfs_root *root,
5045  u64 bytenr, u64 num_bytes, u64 parent,
5046  u64 root_objectid, u64 owner_objectid,
5047  u64 owner_offset, int refs_to_drop,
5048  struct btrfs_delayed_extent_op *extent_op)
5049 {
5050  struct btrfs_key key;
5051  struct btrfs_path *path;
5052  struct btrfs_fs_info *info = root->fs_info;
5053  struct btrfs_root *extent_root = info->extent_root;
5054  struct extent_buffer *leaf;
5055  struct btrfs_extent_item *ei;
5056  struct btrfs_extent_inline_ref *iref;
5057  int ret;
5058  int is_data;
5059  int extent_slot = 0;
5060  int found_extent = 0;
5061  int num_to_del = 1;
5062  u32 item_size;
5063  u64 refs;
5064 
5065  path = btrfs_alloc_path();
5066  if (!path)
5067  return -ENOMEM;
5068 
5069  path->reada = 1;
5070  path->leave_spinning = 1;
5071 
5072  is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5073  BUG_ON(!is_data && refs_to_drop != 1);
5074 
5075  ret = lookup_extent_backref(trans, extent_root, path, &iref,
5076  bytenr, num_bytes, parent,
5077  root_objectid, owner_objectid,
5078  owner_offset);
5079  if (ret == 0) {
5080  extent_slot = path->slots[0];
5081  while (extent_slot >= 0) {
5082  btrfs_item_key_to_cpu(path->nodes[0], &key,
5083  extent_slot);
5084  if (key.objectid != bytenr)
5085  break;
5086  if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5087  key.offset == num_bytes) {
5088  found_extent = 1;
5089  break;
5090  }
5091  if (path->slots[0] - extent_slot > 5)
5092  break;
5093  extent_slot--;
5094  }
5095 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5096  item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5097  if (found_extent && item_size < sizeof(*ei))
5098  found_extent = 0;
5099 #endif
5100  if (!found_extent) {
5101  BUG_ON(iref);
5102  ret = remove_extent_backref(trans, extent_root, path,
5103  NULL, refs_to_drop,
5104  is_data);
5105  if (ret) {
5106  btrfs_abort_transaction(trans, extent_root, ret);
5107  goto out;
5108  }
5109  btrfs_release_path(path);
5110  path->leave_spinning = 1;
5111 
5112  key.objectid = bytenr;
5113  key.type = BTRFS_EXTENT_ITEM_KEY;
5114  key.offset = num_bytes;
5115 
5116  ret = btrfs_search_slot(trans, extent_root,
5117  &key, path, -1, 1);
5118  if (ret) {
5119  printk(KERN_ERR "umm, got %d back from search"
5120  ", was looking for %llu\n", ret,
5121  (unsigned long long)bytenr);
5122  if (ret > 0)
5123  btrfs_print_leaf(extent_root,
5124  path->nodes[0]);
5125  }
5126  if (ret < 0) {
5127  btrfs_abort_transaction(trans, extent_root, ret);
5128  goto out;
5129  }
5130  extent_slot = path->slots[0];
5131  }
5132  } else if (ret == -ENOENT) {
5133  btrfs_print_leaf(extent_root, path->nodes[0]);
5134  WARN_ON(1);
5135  printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5136  "parent %llu root %llu owner %llu offset %llu\n",
5137  (unsigned long long)bytenr,
5138  (unsigned long long)parent,
5139  (unsigned long long)root_objectid,
5140  (unsigned long long)owner_objectid,
5141  (unsigned long long)owner_offset);
5142  } else {
5143  btrfs_abort_transaction(trans, extent_root, ret);
5144  goto out;
5145  }
5146 
5147  leaf = path->nodes[0];
5148  item_size = btrfs_item_size_nr(leaf, extent_slot);
5149 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5150  if (item_size < sizeof(*ei)) {
5151  BUG_ON(found_extent || extent_slot != path->slots[0]);
5152  ret = convert_extent_item_v0(trans, extent_root, path,
5153  owner_objectid, 0);
5154  if (ret < 0) {
5155  btrfs_abort_transaction(trans, extent_root, ret);
5156  goto out;
5157  }
5158 
5159  btrfs_release_path(path);
5160  path->leave_spinning = 1;
5161 
5162  key.objectid = bytenr;
5163  key.type = BTRFS_EXTENT_ITEM_KEY;
5164  key.offset = num_bytes;
5165 
5166  ret = btrfs_search_slot(trans, extent_root, &key, path,
5167  -1, 1);
5168  if (ret) {
5169  printk(KERN_ERR "umm, got %d back from search"
5170  ", was looking for %llu\n", ret,
5171  (unsigned long long)bytenr);
5172  btrfs_print_leaf(extent_root, path->nodes[0]);
5173  }
5174  if (ret < 0) {
5175  btrfs_abort_transaction(trans, extent_root, ret);
5176  goto out;
5177  }
5178 
5179  extent_slot = path->slots[0];
5180  leaf = path->nodes[0];
5181  item_size = btrfs_item_size_nr(leaf, extent_slot);
5182  }
5183 #endif
5184  BUG_ON(item_size < sizeof(*ei));
5185  ei = btrfs_item_ptr(leaf, extent_slot,
5186  struct btrfs_extent_item);
5187  if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5188  struct btrfs_tree_block_info *bi;
5189  BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5190  bi = (struct btrfs_tree_block_info *)(ei + 1);
5191  WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5192  }
5193 
5194  refs = btrfs_extent_refs(leaf, ei);
5195  BUG_ON(refs < refs_to_drop);
5196  refs -= refs_to_drop;
5197 
5198  if (refs > 0) {
5199  if (extent_op)
5200  __run_delayed_extent_op(extent_op, leaf, ei);
5201  /*
5202  * In the case of inline back ref, reference count will
5203  * be updated by remove_extent_backref
5204  */
5205  if (iref) {
5206  BUG_ON(!found_extent);
5207  } else {
5208  btrfs_set_extent_refs(leaf, ei, refs);
5209  btrfs_mark_buffer_dirty(leaf);
5210  }
5211  if (found_extent) {
5212  ret = remove_extent_backref(trans, extent_root, path,
5213  iref, refs_to_drop,
5214  is_data);
5215  if (ret) {
5216  btrfs_abort_transaction(trans, extent_root, ret);
5217  goto out;
5218  }
5219  }
5220  } else {
5221  if (found_extent) {
5222  BUG_ON(is_data && refs_to_drop !=
5223  extent_data_ref_count(root, path, iref));
5224  if (iref) {
5225  BUG_ON(path->slots[0] != extent_slot);
5226  } else {
5227  BUG_ON(path->slots[0] != extent_slot + 1);
5228  path->slots[0] = extent_slot;
5229  num_to_del = 2;
5230  }
5231  }
5232 
5233  ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5234  num_to_del);
5235  if (ret) {
5236  btrfs_abort_transaction(trans, extent_root, ret);
5237  goto out;
5238  }
5239  btrfs_release_path(path);
5240 
5241  if (is_data) {
5242  ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5243  if (ret) {
5244  btrfs_abort_transaction(trans, extent_root, ret);
5245  goto out;
5246  }
5247  }
5248 
5249  ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5250  if (ret) {
5251  btrfs_abort_transaction(trans, extent_root, ret);
5252  goto out;
5253  }
5254  }
5255 out:
5256  btrfs_free_path(path);
5257  return ret;
5258 }
5259 
5260 /*
5261  * when we free a block, it is possible (and likely) that we free the last
5262  * delayed ref for that extent as well. This searches the delayed ref tree for
5263  * a given extent, and if there are no other delayed refs to be processed, it
5264  * removes it from the tree.
5265  */
5266 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5267  struct btrfs_root *root, u64 bytenr)
5268 {
5269  struct btrfs_delayed_ref_head *head;
5270  struct btrfs_delayed_ref_root *delayed_refs;
5271  struct btrfs_delayed_ref_node *ref;
5272  struct rb_node *node;
5273  int ret = 0;
5274 
5275  delayed_refs = &trans->transaction->delayed_refs;
5276  spin_lock(&delayed_refs->lock);
5277  head = btrfs_find_delayed_ref_head(trans, bytenr);
5278  if (!head)
5279  goto out;
5280 
5281  node = rb_prev(&head->node.rb_node);
5282  if (!node)
5283  goto out;
5284 
5285  ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5286 
5287  /* there are still entries for this ref, we can't drop it */
5288  if (ref->bytenr == bytenr)
5289  goto out;
5290 
5291  if (head->extent_op) {
5292  if (!head->must_insert_reserved)
5293  goto out;
5294  kfree(head->extent_op);
5295  head->extent_op = NULL;
5296  }
5297 
5298  /*
5299  * waiting for the lock here would deadlock. If someone else has it
5300  * locked they are already in the process of dropping it anyway
5301  */
5302  if (!mutex_trylock(&head->mutex))
5303  goto out;
5304 
5305  /*
5306  * at this point we have a head with no other entries. Go
5307  * ahead and process it.
5308  */
5309  head->node.in_tree = 0;
5310  rb_erase(&head->node.rb_node, &delayed_refs->root);
5311 
5312  delayed_refs->num_entries--;
5313 
5314  /*
5315  * we don't take a ref on the node because we're removing it from the
5316  * tree, so we just steal the ref the tree was holding.
5317  */
5318  delayed_refs->num_heads--;
5319  if (list_empty(&head->cluster))
5320  delayed_refs->num_heads_ready--;
5321 
5322  list_del_init(&head->cluster);
5323  spin_unlock(&delayed_refs->lock);
5324 
5325  BUG_ON(head->extent_op);
5326  if (head->must_insert_reserved)
5327  ret = 1;
5328 
5329  mutex_unlock(&head->mutex);
5330  btrfs_put_delayed_ref(&head->node);
5331  return ret;
5332 out:
5333  spin_unlock(&delayed_refs->lock);
5334  return 0;
5335 }
5336 
5337 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5338  struct btrfs_root *root,
5339  struct extent_buffer *buf,
5340  u64 parent, int last_ref)
5341 {
5342  struct btrfs_block_group_cache *cache = NULL;
5343  int ret;
5344 
5345  if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5346  ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5347  buf->start, buf->len,
5348  parent, root->root_key.objectid,
5349  btrfs_header_level(buf),
5350  BTRFS_DROP_DELAYED_REF, NULL, 0);
5351  BUG_ON(ret); /* -ENOMEM */
5352  }
5353 
5354  if (!last_ref)
5355  return;
5356 
5357  cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5358 
5359  if (btrfs_header_generation(buf) == trans->transid) {
5360  if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5361  ret = check_ref_cleanup(trans, root, buf->start);
5362  if (!ret)
5363  goto out;
5364  }
5365 
5366  if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5367  pin_down_extent(root, cache, buf->start, buf->len, 1);
5368  goto out;
5369  }
5370 
5371  WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5372 
5373  btrfs_add_free_space(cache, buf->start, buf->len);
5374  btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5375  }
5376 out:
5377  /*
5378  * Deleting the buffer, clear the corrupt flag since it doesn't matter
5379  * anymore.
5380  */
5381  clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5382  btrfs_put_block_group(cache);
5383 }
5384 
5385 /* Can return -ENOMEM */
5386 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5387  u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5388  u64 owner, u64 offset, int for_cow)
5389 {
5390  int ret;
5391  struct btrfs_fs_info *fs_info = root->fs_info;
5392 
5393  /*
5394  * tree log blocks never actually go into the extent allocation
5395  * tree, just update pinning info and exit early.
5396  */
5397  if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5398  WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5399  /* unlocks the pinned mutex */
5400  btrfs_pin_extent(root, bytenr, num_bytes, 1);
5401  ret = 0;
5402  } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5403  ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5404  num_bytes,
5405  parent, root_objectid, (int)owner,
5406  BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5407  } else {
5408  ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5409  num_bytes,
5410  parent, root_objectid, owner,
5411  offset, BTRFS_DROP_DELAYED_REF,
5412  NULL, for_cow);
5413  }
5414  return ret;
5415 }
5416 
5417 static u64 stripe_align(struct btrfs_root *root, u64 val)
5418 {
5419  u64 mask = ((u64)root->stripesize - 1);
5420  u64 ret = (val + mask) & ~mask;
5421  return ret;
5422 }
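/*
 * e.g. (hypothetical stripesize of 64K): stripe_align(root, 190K)
 * computes mask = 0xffff and returns (190K + 0xffff) & ~0xffff = 192K,
 * i.e. the value rounded up to the next stripe boundary.
 */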
5423 
5424 /*
5425  * when we wait for progress in the block group caching, it's because
5426  * our allocation attempt failed at least once. So, we must sleep
5427  * and let some progress happen before we try again.
5428  *
5429  * This function will sleep at least once waiting for new free space to
5430  * show up, and then it will check the block group free space numbers
5431  * for our min num_bytes. Another option is to have it go ahead
5432  * and look in the rbtree for a free extent of a given size, but this
5433  * is a good start.
5434  */
5435 static noinline int
5436 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5437  u64 num_bytes)
5438 {
5439  struct btrfs_caching_control *caching_ctl;
5440  DEFINE_WAIT(wait);
5441 
5442  caching_ctl = get_caching_control(cache);
5443  if (!caching_ctl)
5444  return 0;
5445 
5446  wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5447  (cache->free_space_ctl->free_space >= num_bytes));
5448 
5449  put_caching_control(caching_ctl);
5450  return 0;
5451 }
5452 
5453 static noinline int
5454 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5455 {
5456  struct btrfs_caching_control *caching_ctl;
5457  DEFINE_WAIT(wait);
5458 
5459  caching_ctl = get_caching_control(cache);
5460  if (!caching_ctl)
5461  return 0;
5462 
5463  wait_event(caching_ctl->wait, block_group_cache_done(cache));
5464 
5465  put_caching_control(caching_ctl);
5466  return 0;
5467 }
5468 
5469 static int __get_block_group_index(u64 flags)
5470 {
5471  int index;
5472 
5473  if (flags & BTRFS_BLOCK_GROUP_RAID10)
5474  index = 0;
5475  else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5476  index = 1;
5477  else if (flags & BTRFS_BLOCK_GROUP_DUP)
5478  index = 2;
5479  else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5480  index = 3;
5481  else
5482  index = 4;
5483 
5484  return index;
5485 }
5486 
5487 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488 {
5489  return __get_block_group_index(cache->flags);
5490 }
5491 
5491 
5492 enum btrfs_loop_type {
5493  LOOP_CACHING_NOWAIT = 0,
5494  LOOP_CACHING_WAIT = 1,
5495  LOOP_ALLOC_CHUNK = 2,
5496  LOOP_NO_EMPTY_SIZE = 3,
5497 };
5498 
5499 /*
5500  * walks the btree of allocated extents and finds a hole of a given size.
5501  * The key ins is changed to record the hole:
5502  * ins->objectid == block start
5503  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5504  * ins->offset == number of blocks
5505  * Any available blocks before search_start are skipped.
5506  */
5507 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5508  struct btrfs_root *orig_root,
5509  u64 num_bytes, u64 empty_size,
5510  u64 hint_byte, struct btrfs_key *ins,
5511  u64 data)
5512 {
5513  int ret = 0;
5514  struct btrfs_root *root = orig_root->fs_info->extent_root;
5515  struct btrfs_free_cluster *last_ptr = NULL;
5516  struct btrfs_block_group_cache *block_group = NULL;
5517  struct btrfs_block_group_cache *used_block_group;
5518  u64 search_start = 0;
5519  int empty_cluster = 2 * 1024 * 1024;
5520  struct btrfs_space_info *space_info;
5521  int loop = 0;
5522  int index = 0;
5523  int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5524  RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5525  bool found_uncached_bg = false;
5526  bool failed_cluster_refill = false;
5527  bool failed_alloc = false;
5528  bool use_cluster = true;
5529  bool have_caching_bg = false;
5530 
5531  WARN_ON(num_bytes < root->sectorsize);
5532  btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5533  ins->objectid = 0;
5534  ins->offset = 0;
5535 
5536  trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5537 
5538  space_info = __find_space_info(root->fs_info, data);
5539  if (!space_info) {
5540  printk(KERN_ERR "No space info for %llu\n", data);
5541  return -ENOSPC;
5542  }
5543 
5544  /*
5545  * If the space info is for both data and metadata it means we have a
5546  * small filesystem and we can't use the clustering stuff.
5547  */
5548  if (btrfs_mixed_space_info(space_info))
5549  use_cluster = false;
5550 
5551  if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5552  last_ptr = &root->fs_info->meta_alloc_cluster;
5553  if (!btrfs_test_opt(root, SSD))
5554  empty_cluster = 64 * 1024;
5555  }
5556 
5557  if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5558  btrfs_test_opt(root, SSD)) {
5559  last_ptr = &root->fs_info->data_alloc_cluster;
5560  }
5561 
5562  if (last_ptr) {
5563  spin_lock(&last_ptr->lock);
5564  if (last_ptr->block_group)
5565  hint_byte = last_ptr->window_start;
5566  spin_unlock(&last_ptr->lock);
5567  }
5568 
5569  search_start = max(search_start, first_logical_byte(root, 0));
5570  search_start = max(search_start, hint_byte);
5571 
5572  if (!last_ptr)
5573  empty_cluster = 0;
5574 
5575  if (search_start == hint_byte) {
5576  block_group = btrfs_lookup_block_group(root->fs_info,
5577  search_start);
5578  used_block_group = block_group;
5579  /*
5580  * we don't want to use the block group if it doesn't match our
5581  * allocation bits, or if it's not cached.
5582  *
5583  * However if we are re-searching with an ideal block group
5584  * picked out then we don't care that the block group is cached.
5585  */
5586  if (block_group && block_group_bits(block_group, data) &&
5587  block_group->cached != BTRFS_CACHE_NO) {
5588  down_read(&space_info->groups_sem);
5589  if (list_empty(&block_group->list) ||
5590  block_group->ro) {
5591  /*
5592  * someone is removing this block group,
5593  * we can't jump into the have_block_group
5594  * target because our list pointers are not
5595  * valid
5596  */
5597  btrfs_put_block_group(block_group);
5598  up_read(&space_info->groups_sem);
5599  } else {
5600  index = get_block_group_index(block_group);
5601  goto have_block_group;
5602  }
5603  } else if (block_group) {
5604  btrfs_put_block_group(block_group);
5605  }
5606  }
5607 search:
5608  have_caching_bg = false;
5609  down_read(&space_info->groups_sem);
5610  list_for_each_entry(block_group, &space_info->block_groups[index],
5611  list) {
5612  u64 offset;
5613  int cached;
5614 
5615  used_block_group = block_group;
5616  btrfs_get_block_group(block_group);
5617  search_start = block_group->key.objectid;
5618 
5619  /*
5620  * this can happen if we end up cycling through all the
5621  * raid types, but we want to make sure we only allocate
5622  * for the proper type.
5623  */
5624  if (!block_group_bits(block_group, data)) {
5625  u64 extra = BTRFS_BLOCK_GROUP_DUP |
5626  BTRFS_BLOCK_GROUP_RAID1 |
5627  BTRFS_BLOCK_GROUP_RAID10;
5628 
5629  /*
5630  * if they asked for extra copies and this block group
5631  * doesn't provide them, bail. This does allow us to
5632  * fill raid0 from raid1.
5633  */
5634  if ((data & extra) && !(block_group->flags & extra))
5635  goto loop;
5636  }
5637 
5638 have_block_group:
5639  cached = block_group_cache_done(block_group);
5640  if (unlikely(!cached)) {
5641  found_uncached_bg = true;
5642  ret = cache_block_group(block_group, trans,
5643  orig_root, 0);
5644  BUG_ON(ret < 0);
5645  ret = 0;
5646  }
5647 
5648  if (unlikely(block_group->ro))
5649  goto loop;
5650 
5651  /*
5652  * Ok we want to try and use the cluster allocator, so
5653  * let's look there
5654  */
5655  if (last_ptr) {
5656  /*
5657  * the refill lock keeps out other
5658  * people trying to start a new cluster
5659  */
5660  spin_lock(&last_ptr->refill_lock);
5661  used_block_group = last_ptr->block_group;
5662  if (used_block_group != block_group &&
5663  (!used_block_group ||
5664  used_block_group->ro ||
5665  !block_group_bits(used_block_group, data))) {
5666  used_block_group = block_group;
5667  goto refill_cluster;
5668  }
5669 
5670  if (used_block_group != block_group)
5671  btrfs_get_block_group(used_block_group);
5672 
5673  offset = btrfs_alloc_from_cluster(used_block_group,
5674  last_ptr, num_bytes, used_block_group->key.objectid);
5675  if (offset) {
5676  /* we have a block, we're done */
5677  spin_unlock(&last_ptr->refill_lock);
5678  trace_btrfs_reserve_extent_cluster(root,
5679  block_group, search_start, num_bytes);
5680  goto checks;
5681  }
5682 
5683  WARN_ON(last_ptr->block_group != used_block_group);
5684  if (used_block_group != block_group) {
5685  btrfs_put_block_group(used_block_group);
5686  used_block_group = block_group;
5687  }
5688 refill_cluster:
5689  BUG_ON(used_block_group != block_group);
5690  /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5691  * set up a new cluster, so let's just skip it
5692  * and let the allocator find whatever block
5693  * it can find. If we reach this point, we
5694  * will have tried the cluster allocator
5695  * plenty of times and not have found
5696  * anything, so we are likely way too
5697  * fragmented for the clustering stuff to find
5698  * anything.
5699  *
5700  * However, if the cluster is taken from the
5701  * current block group, release the cluster
5702  * first, so that we stand a better chance of
5703  * succeeding in the unclustered
5704  * allocation. */
5705  if (loop >= LOOP_NO_EMPTY_SIZE &&
5706  last_ptr->block_group != block_group) {
5707  spin_unlock(&last_ptr->refill_lock);
5708  goto unclustered_alloc;
5709  }
5710 
5711  /*
5712  * this cluster didn't work out, free it and
5713  * start over
5714  */
5715  btrfs_return_cluster_to_free_space(NULL, last_ptr);
5716 
5717  if (loop >= LOOP_NO_EMPTY_SIZE) {
5718  spin_unlock(&last_ptr->refill_lock);
5719  goto unclustered_alloc;
5720  }
5721 
5722  /* allocate a cluster in this block group */
5723  ret = btrfs_find_space_cluster(trans, root,
5724  block_group, last_ptr,
5725  search_start, num_bytes,
5726  empty_cluster + empty_size);
5727  if (ret == 0) {
5728  /*
5729  * now pull our allocation out of this
5730  * cluster
5731  */
5732  offset = btrfs_alloc_from_cluster(block_group,
5733  last_ptr, num_bytes,
5734  search_start);
5735  if (offset) {
5736  /* we found one, proceed */
5737  spin_unlock(&last_ptr->refill_lock);
5738  trace_btrfs_reserve_extent_cluster(root,
5739  block_group, search_start,
5740  num_bytes);
5741  goto checks;
5742  }
5743  } else if (!cached && loop > LOOP_CACHING_NOWAIT
5744  && !failed_cluster_refill) {
5745  spin_unlock(&last_ptr->refill_lock);
5746 
5747  failed_cluster_refill = true;
5748  wait_block_group_cache_progress(block_group,
5749  num_bytes + empty_cluster + empty_size);
5750  goto have_block_group;
5751  }
5752 
5753  /*
5754  * at this point we either didn't find a cluster
5755  * or we weren't able to allocate a block from our
5756  * cluster. Free the cluster we've been trying
5757  * to use, and go to the next block group
5758  */
5759  btrfs_return_cluster_to_free_space(NULL, last_ptr);
5760  spin_unlock(&last_ptr->refill_lock);
5761  goto loop;
5762  }
5763 
5764 unclustered_alloc:
5765  spin_lock(&block_group->free_space_ctl->tree_lock);
5766  if (cached &&
5767  block_group->free_space_ctl->free_space <
5768  num_bytes + empty_cluster + empty_size) {
5769  spin_unlock(&block_group->free_space_ctl->tree_lock);
5770  goto loop;
5771  }
5772  spin_unlock(&block_group->free_space_ctl->tree_lock);
5773 
5774  offset = btrfs_find_space_for_alloc(block_group, search_start,
5775  num_bytes, empty_size);
5776  /*
5777  * If we didn't find a chunk, and we haven't failed on this
5778  * block group before, and this block group is in the middle of
5779  * caching and we are ok with waiting, then go ahead and wait
5780  * for progress to be made, and set failed_alloc to true.
5781  *
5782  * If failed_alloc is true then we've already waited on this
5783  * block group once and should move on to the next block group.
5784  */
5785  if (!offset && !failed_alloc && !cached &&
5786  loop > LOOP_CACHING_NOWAIT) {
5787  wait_block_group_cache_progress(block_group,
5788  num_bytes + empty_size);
5789  failed_alloc = true;
5790  goto have_block_group;
5791  } else if (!offset) {
5792  if (!cached)
5793  have_caching_bg = true;
5794  goto loop;
5795  }
5796 checks:
5797  search_start = stripe_align(root, offset);
5798 
5799  /* move on to the next group */
5800  if (search_start + num_bytes >
5801  used_block_group->key.objectid + used_block_group->key.offset) {
5802  btrfs_add_free_space(used_block_group, offset, num_bytes);
5803  goto loop;
5804  }
5805 
5806  if (offset < search_start)
5807  btrfs_add_free_space(used_block_group, offset,
5808  search_start - offset);
5809  BUG_ON(offset > search_start);
5810 
5811  ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5812  alloc_type);
5813  if (ret == -EAGAIN) {
5814  btrfs_add_free_space(used_block_group, offset, num_bytes);
5815  goto loop;
5816  }
5817 
5818  /* we are all good, let's return */
5819  ins->objectid = search_start;
5820  ins->offset = num_bytes;
5821 
5822  trace_btrfs_reserve_extent(orig_root, block_group,
5823  search_start, num_bytes);
5824  if (used_block_group != block_group)
5825  btrfs_put_block_group(used_block_group);
5826  btrfs_put_block_group(block_group);
5827  break;
5828 loop:
5829  failed_cluster_refill = false;
5830  failed_alloc = false;
5831  BUG_ON(index != get_block_group_index(block_group));
5832  if (used_block_group != block_group)
5833  btrfs_put_block_group(used_block_group);
5834  btrfs_put_block_group(block_group);
5835  }
5836  up_read(&space_info->groups_sem);
5837 
5838  if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5839  goto search;
5840 
5841  if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5842  goto search;
5843 
5844  /*
5845  * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5846  * caching kthreads as we move along
5847  * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5848  * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5849  * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5850  * again
5851  */
5852  if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5853  index = 0;
5854  loop++;
5855  if (loop == LOOP_ALLOC_CHUNK) {
5856  ret = do_chunk_alloc(trans, root, data,
5857  CHUNK_ALLOC_FORCE);
5858  /*
5859  * Do not bail out on ENOSPC since we
5860  * can do more things.
5861  */
5862  if (ret < 0 && ret != -ENOSPC) {
5863  btrfs_abort_transaction(trans,
5864  root, ret);
5865  goto out;
5866  }
5867  }
5868 
5869  if (loop == LOOP_NO_EMPTY_SIZE) {
5870  empty_size = 0;
5871  empty_cluster = 0;
5872  }
5873 
5874  goto search;
5875  } else if (!ins->objectid) {
5876  ret = -ENOSPC;
5877  } else if (ins->objectid) {
5878  ret = 0;
5879  }
5880 out:
5881 
5882  return ret;
5883 }
5884 
5885 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5886  int dump_block_groups)
5887 {
5888  struct btrfs_block_group_cache *cache;
5889  int index = 0;
5890 
5891  spin_lock(&info->lock);
5892  printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5893  (unsigned long long)info->flags,
5894  (unsigned long long)(info->total_bytes - info->bytes_used -
5895  info->bytes_pinned - info->bytes_reserved -
5896  info->bytes_readonly),
5897  (info->full) ? "" : "not ");
5898  printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5899  "reserved=%llu, may_use=%llu, readonly=%llu\n",
5900  (unsigned long long)info->total_bytes,
5901  (unsigned long long)info->bytes_used,
5902  (unsigned long long)info->bytes_pinned,
5903  (unsigned long long)info->bytes_reserved,
5904  (unsigned long long)info->bytes_may_use,
5905  (unsigned long long)info->bytes_readonly);
5906  spin_unlock(&info->lock);
5907 
5908  if (!dump_block_groups)
5909  return;
5910 
5911  down_read(&info->groups_sem);
5912 again:
5913  list_for_each_entry(cache, &info->block_groups[index], list) {
5914  spin_lock(&cache->lock);
5915  printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5916  (unsigned long long)cache->key.objectid,
5917  (unsigned long long)cache->key.offset,
5918  (unsigned long long)btrfs_block_group_used(&cache->item),
5919  (unsigned long long)cache->pinned,
5920  (unsigned long long)cache->reserved,
5921  cache->ro ? "[readonly]" : "");
5922  btrfs_dump_free_space(cache, bytes);
5923  spin_unlock(&cache->lock);
5924  }
5925  if (++index < BTRFS_NR_RAID_TYPES)
5926  goto again;
5927  up_read(&info->groups_sem);
5928 }
5929 
5930 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5931  struct btrfs_root *root,
5932  u64 num_bytes, u64 min_alloc_size,
5933  u64 empty_size, u64 hint_byte,
5934  struct btrfs_key *ins, u64 data)
5935 {
5936  bool final_tried = false;
5937  int ret;
5938 
5939  data = btrfs_get_alloc_profile(root, data);
5940 again:
5941  WARN_ON(num_bytes < root->sectorsize);
5942  ret = find_free_extent(trans, root, num_bytes, empty_size,
5943  hint_byte, ins, data);
5944 
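 /*
  * On ENOSPC, retry with a smaller extent: halve num_bytes, round down
  * to a sector multiple, and clamp to min_alloc_size; only a failure at
  * min_alloc_size itself is treated as final.
  */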
5945  if (ret == -ENOSPC) {
5946  if (!final_tried) {
5947  num_bytes = num_bytes >> 1;
5948  num_bytes = num_bytes & ~(root->sectorsize - 1);
5949  num_bytes = max(num_bytes, min_alloc_size);
5950  if (num_bytes == min_alloc_size)
5951  final_tried = true;
5952  goto again;
5953  } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
5954  struct btrfs_space_info *sinfo;
5955 
5956  sinfo = __find_space_info(root->fs_info, data);
5957  printk(KERN_ERR "btrfs allocation failed flags %llu, "
5958  "wanted %llu\n", (unsigned long long)data,
5959  (unsigned long long)num_bytes);
5960  if (sinfo)
5961  dump_space_info(sinfo, num_bytes, 1);
5962  }
5963  }
5964 
5965  trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5966 
5967  return ret;
5968 }
5969 
5970 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5971  u64 start, u64 len, int pin)
5972 {
5973  struct btrfs_block_group_cache *cache;
5974  int ret = 0;
5975 
5976  cache = btrfs_lookup_block_group(root->fs_info, start);
5977  if (!cache) {
5978  printk(KERN_ERR "Unable to find block group for %llu\n",
5979  (unsigned long long)start);
5980  return -ENOSPC;
5981  }
5982 
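 /*
  * With 'pin' set the range is pinned down instead of being handed
  * straight back to the free space cache with its reservation dropped.
  */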
5983  if (btrfs_test_opt(root, DISCARD))
5984  ret = btrfs_discard_extent(root, start, len, NULL);
5985 
5986  if (pin)
5987  pin_down_extent(root, cache, start, len, 1);
5988  else {
5989  btrfs_add_free_space(cache, start, len);
5990  btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5991  }
5992  btrfs_put_block_group(cache);
5993 
5994  trace_btrfs_reserved_extent_free(root, start, len);
5995 
5996  return ret;
5997 }
5998 
5999 int btrfs_free_reserved_extent(struct btrfs_root *root,
6000  u64 start, u64 len)
6001 {
6002  return __btrfs_free_reserved_extent(root, start, len, 0);
6003 }
6004 
6005 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6006  u64 start, u64 len)
6007 {
6008  return __btrfs_free_reserved_extent(root, start, len, 1);
6009 }
6010 
6011 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6012  struct btrfs_root *root,
6013  u64 parent, u64 root_objectid,
6014  u64 flags, u64 owner, u64 offset,
6015  struct btrfs_key *ins, int ref_mod)
6016 {
6017  int ret;
6018  struct btrfs_fs_info *fs_info = root->fs_info;
6019  struct btrfs_extent_item *extent_item;
6020  struct btrfs_extent_inline_ref *iref;
6021  struct btrfs_path *path;
6022  struct extent_buffer *leaf;
6023  int type;
6024  u32 size;
6025 
6026  if (parent > 0)
6027  type = BTRFS_SHARED_DATA_REF_KEY;
6028  else
6029  type = BTRFS_EXTENT_DATA_REF_KEY;
6030 
6031  size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6032 
6033  path = btrfs_alloc_path();
6034  if (!path)
6035  return -ENOMEM;
6036 
6037  path->leave_spinning = 1;
6038  ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6039  ins, size);
6040  if (ret) {
6041  btrfs_free_path(path);
6042  return ret;
6043  }
6044 
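 /*
  * The item just inserted is a btrfs_extent_item followed by a single
  * inline backref: a shared data ref keyed on the parent block when
  * parent is set, otherwise an extent data ref carrying the root, owner
  * objectid and offset.
  */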
6045  leaf = path->nodes[0];
6046  extent_item = btrfs_item_ptr(leaf, path->slots[0],
6047  struct btrfs_extent_item);
6048  btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6049  btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6050  btrfs_set_extent_flags(leaf, extent_item,
6051  flags | BTRFS_EXTENT_FLAG_DATA);
6052 
6053  iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6054  btrfs_set_extent_inline_ref_type(leaf, iref, type);
6055  if (parent > 0) {
6056  struct btrfs_shared_data_ref *ref;
6057  ref = (struct btrfs_shared_data_ref *)(iref + 1);
6058  btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6059  btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6060  } else {
6061  struct btrfs_extent_data_ref *ref;
6062  ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6063  btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6064  btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6065  btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6066  btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6067  }
6068 
6069  btrfs_mark_buffer_dirty(path->nodes[0]);
6070  btrfs_free_path(path);
6071 
6072  ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6073  if (ret) { /* -ENOENT, logic error */
6074  printk(KERN_ERR "btrfs update block group failed for %llu "
6075  "%llu\n", (unsigned long long)ins->objectid,
6076  (unsigned long long)ins->offset);
6077  BUG();
6078  }
6079  return ret;
6080 }
6081 
6082 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6083  struct btrfs_root *root,
6084  u64 parent, u64 root_objectid,
6085  u64 flags, struct btrfs_disk_key *key,
6086  int level, struct btrfs_key *ins)
6087 {
6088  int ret;
6089  struct btrfs_fs_info *fs_info = root->fs_info;
6090  struct btrfs_extent_item *extent_item;
6091  struct btrfs_tree_block_info *block_info;
6092  struct btrfs_extent_inline_ref *iref;
6093  struct btrfs_path *path;
6094  struct extent_buffer *leaf;
6095  u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6096 
6097  path = btrfs_alloc_path();
6098  if (!path)
6099  return -ENOMEM;
6100 
6101  path->leave_spinning = 1;
6102  ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6103  ins, size);
6104  if (ret) {
6105  btrfs_free_path(path);
6106  return ret;
6107  }
6108 
6109  leaf = path->nodes[0];
6110  extent_item = btrfs_item_ptr(leaf, path->slots[0],
6111  struct btrfs_extent_item);
6112  btrfs_set_extent_refs(leaf, extent_item, 1);
6113  btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6114  btrfs_set_extent_flags(leaf, extent_item,
6115  flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6116  block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6117 
6118  btrfs_set_tree_block_key(leaf, block_info, key);
6119  btrfs_set_tree_block_level(leaf, block_info, level);
6120 
6121  iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6122  if (parent > 0) {
6123  BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6124  btrfs_set_extent_inline_ref_type(leaf, iref,
6125  BTRFS_SHARED_BLOCK_REF_KEY);
6126  btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6127  } else {
6128  btrfs_set_extent_inline_ref_type(leaf, iref,
6129  BTRFS_TREE_BLOCK_REF_KEY);
6130  btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6131  }
6132 
6133  btrfs_mark_buffer_dirty(leaf);
6134  btrfs_free_path(path);
6135 
6136  ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6137  if (ret) { /* -ENOENT, logic error */
6138  printk(KERN_ERR "btrfs update block group failed for %llu "
6139  "%llu\n", (unsigned long long)ins->objectid,
6140  (unsigned long long)ins->offset);
6141  BUG();
6142  }
6143  return ret;
6144 }
6145 
6146 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6147  struct btrfs_root *root,
6148  u64 root_objectid, u64 owner,
6149  u64 offset, struct btrfs_key *ins)
6150 {
6151  int ret;
6152 
6153  BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6154 
6155  ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6156  ins->offset, 0,
6157  root_objectid, owner, offset,
6158  BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6159  return ret;
6160 }
6161 
6162 /*
6163  * this is used by the tree logging recovery code. It records that
6164  * an extent has been allocated and makes sure to clear the free
6165  * space cache bits as well
6166  */
6167 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6168  struct btrfs_root *root,
6169  u64 root_objectid, u64 owner, u64 offset,
6170  struct btrfs_key *ins)
6171 {
6172  int ret;
6173  struct btrfs_block_group_cache *block_group;
6174  struct btrfs_caching_control *caching_ctl;
6175  u64 start = ins->objectid;
6176  u64 num_bytes = ins->offset;
6177 
6178  block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6179  cache_block_group(block_group, trans, NULL, 0);
6180  caching_ctl = get_caching_control(block_group);
6181 
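 /*
  * Compare the extent against the caching progress: wholly beyond it we
  * exclude the range so the caching thread skips it, wholly behind it we
  * just remove it from the free space cache, and if it straddles the
  * boundary it is split into those two parts.
  */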
6182  if (!caching_ctl) {
6183  BUG_ON(!block_group_cache_done(block_group));
6184  ret = btrfs_remove_free_space(block_group, start, num_bytes);
6185  BUG_ON(ret); /* -ENOMEM */
6186  } else {
6187  mutex_lock(&caching_ctl->mutex);
6188 
6189  if (start >= caching_ctl->progress) {
6190  ret = add_excluded_extent(root, start, num_bytes);
6191  BUG_ON(ret); /* -ENOMEM */
6192  } else if (start + num_bytes <= caching_ctl->progress) {
6193  ret = btrfs_remove_free_space(block_group,
6194  start, num_bytes);
6195  BUG_ON(ret); /* -ENOMEM */
6196  } else {
6197  num_bytes = caching_ctl->progress - start;
6198  ret = btrfs_remove_free_space(block_group,
6199  start, num_bytes);
6200  BUG_ON(ret); /* -ENOMEM */
6201 
6202  start = caching_ctl->progress;
6203  num_bytes = ins->objectid + ins->offset -
6204  caching_ctl->progress;
6205  ret = add_excluded_extent(root, start, num_bytes);
6206  BUG_ON(ret); /* -ENOMEM */
6207  }
6208 
6209  mutex_unlock(&caching_ctl->mutex);
6210  put_caching_control(caching_ctl);
6211  }
6212 
6213  ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6214  RESERVE_ALLOC_NO_ACCOUNT);
6215  BUG_ON(ret); /* logic error */
6216  btrfs_put_block_group(block_group);
6217  ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6218  0, owner, offset, ins, 1);
6219  return ret;
6220 }
6221 
6222 static struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6223  struct btrfs_root *root,
6224  u64 bytenr, u32 blocksize,
6225  int level)
6226 {
6227  struct extent_buffer *buf;
6228 
6229  buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6230  if (!buf)
6231  return ERR_PTR(-ENOMEM);
6232  btrfs_set_header_generation(buf, trans->transid);
6233  btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6234  btrfs_tree_lock(buf);
6235  clean_tree_block(trans, root, buf);
6236  clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6237 
6238  btrfs_set_lock_blocking(buf);
6239  btrfs_set_buffer_uptodate(buf);
6240 
6241  if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6242  /*
6243  * we allow two log transactions at a time, use different
6244  * EXTENT bit to differentiate dirty pages.
6245  */
6246  if (root->log_transid % 2 == 0)
6247  set_extent_dirty(&root->dirty_log_pages, buf->start,
6248  buf->start + buf->len - 1, GFP_NOFS);
6249  else
6250  set_extent_new(&root->dirty_log_pages, buf->start,
6251  buf->start + buf->len - 1, GFP_NOFS);
6252  } else {
6253  set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6254  buf->start + buf->len - 1, GFP_NOFS);
6255  }
6256  trans->blocks_used++;
6257  /* this returns a buffer locked for blocking */
6258  return buf;
6259 }
6260 
6261 static struct btrfs_block_rsv *
6262 use_block_rsv(struct btrfs_trans_handle *trans,
6263  struct btrfs_root *root, u32 blocksize)
6264 {
6265  struct btrfs_block_rsv *block_rsv;
6266  struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6267  int ret;
6268 
6269  block_rsv = get_block_rsv(trans, root);
6270 
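 /*
  * Fallback order: the reservation attached to this transaction/root,
  * then a fresh metadata reservation, then bytes borrowed from the
  * global reserve; only when all of those fail do we return -ENOSPC.
  */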
6271  if (block_rsv->size == 0) {
6272  ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6273  /*
6274  * If we couldn't reserve metadata bytes try and use some from
6275  * the global reserve.
6276  */
6277  if (ret && block_rsv != global_rsv) {
6278  ret = block_rsv_use_bytes(global_rsv, blocksize);
6279  if (!ret)
6280  return global_rsv;
6281  return ERR_PTR(ret);
6282  } else if (ret) {
6283  return ERR_PTR(ret);
6284  }
6285  return block_rsv;
6286  }
6287 
6288  ret = block_rsv_use_bytes(block_rsv, blocksize);
6289  if (!ret)
6290  return block_rsv;
6291  if (ret && !block_rsv->failfast) {
6292  static DEFINE_RATELIMIT_STATE(_rs,
6292  DEFAULT_RATELIMIT_INTERVAL * 10,
6294  /*DEFAULT_RATELIMIT_BURST*/ 2);
6295  if (__ratelimit(&_rs)) {
6296  printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
6297  WARN_ON(1);
6298  }
6299  ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6300  if (!ret) {
6301  return block_rsv;
6302  } else if (ret && block_rsv != global_rsv) {
6303  ret = block_rsv_use_bytes(global_rsv, blocksize);
6304  if (!ret)
6305  return global_rsv;
6306  }
6307  }
6308 
6309  return ERR_PTR(-ENOSPC);
6310 }
6311 
6312 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6313  struct btrfs_block_rsv *block_rsv, u32 blocksize)
6314 {
6315  block_rsv_add_bytes(block_rsv, blocksize, 0);
6316  block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6317 }
6318 
6319 /*
6320  * finds a free extent and does all the dirty work required for allocation
6321  * returns the key for the extent through ins, and a tree buffer for
6322  * the first block of the extent through buf.
6323  *
6324  * returns the tree buffer or NULL.
6325  */
6326 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6327  struct btrfs_root *root, u32 blocksize,
6328  u64 parent, u64 root_objectid,
6329  struct btrfs_disk_key *key, int level,
6330  u64 hint, u64 empty_size)
6331 {
6332  struct btrfs_key ins;
6333  struct btrfs_block_rsv *block_rsv;
6334  struct extent_buffer *buf;
6335  u64 flags = 0;
6336  int ret;
6337 
6338 
6339  block_rsv = use_block_rsv(trans, root, blocksize);
6340  if (IS_ERR(block_rsv))
6341  return ERR_CAST(block_rsv);
6342 
6343  ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6344  empty_size, hint, &ins, 0);
6345  if (ret) {
6346  unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6347  return ERR_PTR(ret);
6348  }
6349 
6350  buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6351  blocksize, level);
6352  BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6353 
6354  if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6355  if (parent == 0)
6356  parent = ins.objectid;
6357  flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6358  } else
6359  BUG_ON(parent > 0);
6360 
6361  if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6362  struct btrfs_delayed_extent_op *extent_op;
6363  extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
6364  BUG_ON(!extent_op); /* -ENOMEM */
6365  if (key)
6366  memcpy(&extent_op->key, key, sizeof(extent_op->key));
6367  else
6368  memset(&extent_op->key, 0, sizeof(extent_op->key));
6369  extent_op->flags_to_set = flags;
6370  extent_op->update_key = 1;
6371  extent_op->update_flags = 1;
6372  extent_op->is_data = 0;
6373 
6374  ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6375  ins.objectid,
6376  ins.offset, parent, root_objectid,
6377  level, BTRFS_ADD_DELAYED_EXTENT,
6378  extent_op, 0);
6379  BUG_ON(ret); /* -ENOMEM */
6380  }
6381  return buf;
6382 }
6383 
6384 struct walk_control {
6385  u64 refs[BTRFS_MAX_LEVEL];
6386  u64 flags[BTRFS_MAX_LEVEL];
6387  struct btrfs_key update_progress;
6388  int stage;
6389  int level;
6390  int shared_level;
6391  int update_ref;
6392  int keep_locks;
6393  int reada_slot;
6394  int reada_count;
6395  int for_reloc;
6396 };
6397 
6398 #define DROP_REFERENCE 1
6399 #define UPDATE_BACKREF 2
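 /*
  * The drop walk runs in these two stages: DROP_REFERENCE drops one
  * reference per block on the way down, and UPDATE_BACKREF is entered
  * for a shared subtree so its backrefs are converted to full backrefs
  * before the drop continues (see do_walk_down below).
  */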
6400 
6401 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6402  struct btrfs_root *root,
6403  struct walk_control *wc,
6404  struct btrfs_path *path)
6405 {
6406  u64 bytenr;
6407  u64 generation;
6408  u64 refs;
6409  u64 flags;
6410  u32 nritems;
6411  u32 blocksize;
6412  struct btrfs_key key;
6413  struct extent_buffer *eb;
6414  int ret;
6415  int slot;
6416  int nread = 0;
6417 
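 /*
  * Adapt the readahead window: shrink it to 2/3 when the current slot is
  * behind the last readahead position, otherwise grow it by 3/2, clamped
  * between 2 and the number of pointers per node.
  */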
6418  if (path->slots[wc->level] < wc->reada_slot) {
6419  wc->reada_count = wc->reada_count * 2 / 3;
6420  wc->reada_count = max(wc->reada_count, 2);
6421  } else {
6422  wc->reada_count = wc->reada_count * 3 / 2;
6423  wc->reada_count = min_t(int, wc->reada_count,
6424  BTRFS_NODEPTRS_PER_BLOCK(root));
6425  }
6426 
6427  eb = path->nodes[wc->level];
6428  nritems = btrfs_header_nritems(eb);
6429  blocksize = btrfs_level_size(root, wc->level - 1);
6430 
6431  for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6432  if (nread >= wc->reada_count)
6433  break;
6434 
6435  cond_resched();
6436  bytenr = btrfs_node_blockptr(eb, slot);
6437  generation = btrfs_node_ptr_generation(eb, slot);
6438 
6439  if (slot == path->slots[wc->level])
6440  goto reada;
6441 
6442  if (wc->stage == UPDATE_BACKREF &&
6443  generation <= root->root_key.offset)
6444  continue;
6445 
6446  /* We don't lock the tree block, it's OK to be racy here */
6447  ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6448  &refs, &flags);
6449  /* We don't care about errors in readahead. */
6450  if (ret < 0)
6451  continue;
6452  BUG_ON(refs == 0);
6453 
6454  if (wc->stage == DROP_REFERENCE) {
6455  if (refs == 1)
6456  goto reada;
6457 
6458  if (wc->level == 1 &&
6459  (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6460  continue;
6461  if (!wc->update_ref ||
6462  generation <= root->root_key.offset)
6463  continue;
6464  btrfs_node_key_to_cpu(eb, &key, slot);
6465  ret = btrfs_comp_cpu_keys(&key,
6466  &wc->update_progress);
6467  if (ret < 0)
6468  continue;
6469  } else {
6470  if (wc->level == 1 &&
6471  (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6472  continue;
6473  }
6474 reada:
6475  ret = readahead_tree_block(root, bytenr, blocksize,
6476  generation);
6477  if (ret)
6478  break;
6479  nread++;
6480  }
6481  wc->reada_slot = slot;
6482 }
6483 
6484 /*
6485  * helper to process tree block while walking down the tree.
6486  *
6487  * when wc->stage == UPDATE_BACKREF, this function updates
6488  * back refs for pointers in the block.
6489  *
6490  * NOTE: return value 1 means we should stop walking down.
6491  */
6492 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6493  struct btrfs_root *root,
6494  struct btrfs_path *path,
6495  struct walk_control *wc, int lookup_info)
6496 {
6497  int level = wc->level;
6498  struct extent_buffer *eb = path->nodes[level];
6499  u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6500  int ret;
6501 
6502  if (wc->stage == UPDATE_BACKREF &&
6503  btrfs_header_owner(eb) != root->root_key.objectid)
6504  return 1;
6505 
6506  /*
6507  * when reference count of tree block is 1, it won't increase
6508  * again. once full backref flag is set, we never clear it.
6509  */
6510  if (lookup_info &&
6511  ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6512  (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6513  BUG_ON(!path->locks[level]);
6514  ret = btrfs_lookup_extent_info(trans, root,
6515  eb->start, eb->len,
6516  &wc->refs[level],
6517  &wc->flags[level]);
6518  BUG_ON(ret == -ENOMEM);
6519  if (ret)
6520  return ret;
6521  BUG_ON(wc->refs[level] == 0);
6522  }
6523 
6524  if (wc->stage == DROP_REFERENCE) {
6525  if (wc->refs[level] > 1)
6526  return 1;
6527 
6528  if (path->locks[level] && !wc->keep_locks) {
6529  btrfs_tree_unlock_rw(eb, path->locks[level]);
6530  path->locks[level] = 0;
6531  }
6532  return 0;
6533  }
6534 
6535  /* wc->stage == UPDATE_BACKREF */
6536  if (!(wc->flags[level] & flag)) {
6537  BUG_ON(!path->locks[level]);
6538  ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6539  BUG_ON(ret); /* -ENOMEM */
6540  ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6541  BUG_ON(ret); /* -ENOMEM */
6542  ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6543  eb->len, flag, 0);
6544  BUG_ON(ret); /* -ENOMEM */
6545  wc->flags[level] |= flag;
6546  }
6547 
6548  /*
6549  * the block is shared by multiple trees, so it's not good to
6550  * keep the tree lock
6551  */
6552  if (path->locks[level] && level > 0) {
6553  btrfs_tree_unlock_rw(eb, path->locks[level]);
6554  path->locks[level] = 0;
6555  }
6556  return 0;
6557 }
6558 
6559 /*
6560  * helper to process tree block pointer.
6561  *
6562  * when wc->stage == DROP_REFERENCE, this function checks
6563  * reference count of the block pointed to. if the block
6564  * is shared and we need update back refs for the subtree
6565  * rooted at the block, this function changes wc->stage to
6566  * UPDATE_BACKREF. if the block is shared and there is no
6567  * need to update back, this function drops the reference
6568  * to the block.
6569  *
6570  * NOTE: return value 1 means we should stop walking down.
6571  */
6572 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6573  struct btrfs_root *root,
6574  struct btrfs_path *path,
6575  struct walk_control *wc, int *lookup_info)
6576 {
6577  u64 bytenr;
6578  u64 generation;
6579  u64 parent;
6580  u32 blocksize;
6581  struct btrfs_key key;
6582  struct extent_buffer *next;
6583  int level = wc->level;
6584  int reada = 0;
6585  int ret = 0;
6586 
6587  generation = btrfs_node_ptr_generation(path->nodes[level],
6588  path->slots[level]);
6589  /*
6590  * if the lower level block was created before the snapshot
6591  * was created, we know there is no need to update back refs
6592  * for the subtree
6593  */
6594  if (wc->stage == UPDATE_BACKREF &&
6595  generation <= root->root_key.offset) {
6596  *lookup_info = 1;
6597  return 1;
6598  }
6599 
6600  bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6601  blocksize = btrfs_level_size(root, level - 1);
6602 
6603  next = btrfs_find_tree_block(root, bytenr, blocksize);
6604  if (!next) {
6605  next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6606  if (!next)
6607  return -ENOMEM;
6608  reada = 1;
6609  }
6610  btrfs_tree_lock(next);
6611  btrfs_set_lock_blocking(next);
6612 
6613  ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6614  &wc->refs[level - 1],
6615  &wc->flags[level - 1]);
6616  if (ret < 0) {
6617  btrfs_tree_unlock(next);
6618  return ret;
6619  }
6620 
6621  BUG_ON(wc->refs[level - 1] == 0);
6622  *lookup_info = 0;
6623 
6624  if (wc->stage == DROP_REFERENCE) {
6625  if (wc->refs[level - 1] > 1) {
6626  if (level == 1 &&
6627  (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6628  goto skip;
6629 
6630  if (!wc->update_ref ||
6631  generation <= root->root_key.offset)
6632  goto skip;
6633 
6634  btrfs_node_key_to_cpu(path->nodes[level], &key,
6635  path->slots[level]);
6636  ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6637  if (ret < 0)
6638  goto skip;
6639 
6640  wc->stage = UPDATE_BACKREF;
6641  wc->shared_level = level - 1;
6642  }
6643  } else {
6644  if (level == 1 &&
6645  (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6646  goto skip;
6647  }
6648 
6649  if (!btrfs_buffer_uptodate(next, generation, 0)) {
6650  btrfs_tree_unlock(next);
6651  free_extent_buffer(next);
6652  next = NULL;
6653  *lookup_info = 1;
6654  }
6655 
6656  if (!next) {
6657  if (reada && level == 1)
6658  reada_walk_down(trans, root, wc, path);
6659  next = read_tree_block(root, bytenr, blocksize, generation);
6660  if (!next)
6661  return -EIO;
6662  btrfs_tree_lock(next);
6663  btrfs_set_lock_blocking(next);
6664  }
6665 
6666  level--;
6667  BUG_ON(level != btrfs_header_level(next));
6668  path->nodes[level] = next;
6669  path->slots[level] = 0;
6670  path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6671  wc->level = level;
6672  if (wc->level == 1)
6673  wc->reada_slot = 0;
6674  return 0;
6675 skip:
6676  wc->refs[level - 1] = 0;
6677  wc->flags[level - 1] = 0;
6678  if (wc->stage == DROP_REFERENCE) {
6679  if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6680  parent = path->nodes[level]->start;
6681  } else {
6682  BUG_ON(root->root_key.objectid !=
6683  btrfs_header_owner(path->nodes[level]));
6684  parent = 0;
6685  }
6686 
6687  ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6688  root->root_key.objectid, level - 1, 0, 0);
6689  BUG_ON(ret); /* -ENOMEM */
6690  }
6691  btrfs_tree_unlock(next);
6692  free_extent_buffer(next);
6693  *lookup_info = 1;
6694  return 1;
6695 }
6696 
6697 /*
6698  * helper to process tree block while walking up the tree.
6699  *
6700  * when wc->stage == DROP_REFERENCE, this function drops
6701  * reference count on the block.
6702  *
6703  * when wc->stage == UPDATE_BACKREF, this function changes
6704  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6705  * to UPDATE_BACKREF previously while processing the block.
6706  *
6707  * NOTE: return value 1 means we should stop walking up.
6708  */
6709 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6710  struct btrfs_root *root,
6711  struct btrfs_path *path,
6712  struct walk_control *wc)
6713 {
6714  int ret;
6715  int level = wc->level;
6716  struct extent_buffer *eb = path->nodes[level];
6717  u64 parent = 0;
6718 
6719  if (wc->stage == UPDATE_BACKREF) {
6720  BUG_ON(wc->shared_level < level);
6721  if (level < wc->shared_level)
6722  goto out;
6723 
6724  ret = find_next_key(path, level + 1, &wc->update_progress);
6725  if (ret > 0)
6726  wc->update_ref = 0;
6727 
6728  wc->stage = DROP_REFERENCE;
6729  wc->shared_level = -1;
6730  path->slots[level] = 0;
6731 
6732  /*
6733  * check reference count again if the block isn't locked.
6734  * we should start walking down the tree again if reference
6735  * count is one.
6736  */
6737  if (!path->locks[level]) {
6738  BUG_ON(level == 0);
6739  btrfs_tree_lock(eb);
6740  btrfs_set_lock_blocking(eb);
6741  path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6742 
6743  ret = btrfs_lookup_extent_info(trans, root,
6744  eb->start, eb->len,
6745  &wc->refs[level],
6746  &wc->flags[level]);
6747  if (ret < 0) {
6748  btrfs_tree_unlock_rw(eb, path->locks[level]);
6749  return ret;
6750  }
6751  BUG_ON(wc->refs[level] == 0);
6752  if (wc->refs[level] == 1) {
6753  btrfs_tree_unlock_rw(eb, path->locks[level]);
6754  return 1;
6755  }
6756  }
6757  }
6758 
6759  /* wc->stage == DROP_REFERENCE */
6760  BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6761 
6762  if (wc->refs[level] == 1) {
6763  if (level == 0) {
6764  if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6765  ret = btrfs_dec_ref(trans, root, eb, 1,
6766  wc->for_reloc);
6767  else
6768  ret = btrfs_dec_ref(trans, root, eb, 0,
6769  wc->for_reloc);
6770  BUG_ON(ret); /* -ENOMEM */
6771  }
6772  /* make block locked assertion in clean_tree_block happy */
6773  if (!path->locks[level] &&
6774  btrfs_header_generation(eb) == trans->transid) {
6775  btrfs_tree_lock(eb);
6776  btrfs_set_lock_blocking(eb);
6777  path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6778  }
6779  clean_tree_block(trans, root, eb);
6780  }
6781 
6782  if (eb == root->node) {
6783  if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6784  parent = eb->start;
6785  else
6786  BUG_ON(root->root_key.objectid !=
6787  btrfs_header_owner(eb));
6788  } else {
6789  if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6790  parent = path->nodes[level + 1]->start;
6791  else
6792  BUG_ON(root->root_key.objectid !=
6793  btrfs_header_owner(path->nodes[level + 1]));
6794  }
6795 
6796  btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6797 out:
6798  wc->refs[level] = 0;
6799  wc->flags[level] = 0;
6800  return 0;
6801 }
6802 
6803 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6804  struct btrfs_root *root,
6805  struct btrfs_path *path,
6806  struct walk_control *wc)
6807 {
6808  int level = wc->level;
6809  int lookup_info = 1;
6810  int ret;
6811 
6812  while (level >= 0) {
6813  ret = walk_down_proc(trans, root, path, wc, lookup_info);
6814  if (ret > 0)
6815  break;
6816 
6817  if (level == 0)
6818  break;
6819 
6820  if (path->slots[level] >=
6821  btrfs_header_nritems(path->nodes[level]))
6822  break;
6823 
6824  ret = do_walk_down(trans, root, path, wc, &lookup_info);
6825  if (ret > 0) {
6826  path->slots[level]++;
6827  continue;
6828  } else if (ret < 0)
6829  return ret;
6830  level = wc->level;
6831  }
6832  return 0;
6833 }
6834 
6835 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6836  struct btrfs_root *root,
6837  struct btrfs_path *path,
6838  struct walk_control *wc, int max_level)
6839 {
6840  int level = wc->level;
6841  int ret;
6842 
6843  path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6844  while (level < max_level && path->nodes[level]) {
6845  wc->level = level;
6846  if (path->slots[level] + 1 <
6847  btrfs_header_nritems(path->nodes[level])) {
6848  path->slots[level]++;
6849  return 0;
6850  } else {
6851  ret = walk_up_proc(trans, root, path, wc);
6852  if (ret > 0)
6853  return 0;
6854 
6855  if (path->locks[level]) {
6856  btrfs_tree_unlock_rw(path->nodes[level],
6857  path->locks[level]);
6858  path->locks[level] = 0;
6859  }
6860  free_extent_buffer(path->nodes[level]);
6861  path->nodes[level] = NULL;
6862  level++;
6863  }
6864  }
6865  return 1;
6866 }
6867 
6868 /*
6869  * drop a subvolume tree.
6870  *
6871  * this function traverses the tree freeing any blocks that are only
6872  * referenced by the tree.
6873  *
6874  * when a shared tree block is found. this function decreases its
6875  * reference count by one. if update_ref is true, this function
6876  * also makes sure backrefs for the shared block and all lower level
6877  * blocks are properly updated.
6878  */
6879 int btrfs_drop_snapshot(struct btrfs_root *root,
6880  struct btrfs_block_rsv *block_rsv, int update_ref,
6881  int for_reloc)
6882 {
6883  struct btrfs_path *path;
6884  struct btrfs_trans_handle *trans;
6885  struct btrfs_root *tree_root = root->fs_info->tree_root;
6886  struct btrfs_root_item *root_item = &root->root_item;
6887  struct walk_control *wc;
6888  struct btrfs_key key;
6889  int err = 0;
6890  int ret;
6891  int level;
6892 
6893  path = btrfs_alloc_path();
6894  if (!path) {
6895  err = -ENOMEM;
6896  goto out;
6897  }
6898 
6899  wc = kzalloc(sizeof(*wc), GFP_NOFS);
6900  if (!wc) {
6901  btrfs_free_path(path);
6902  err = -ENOMEM;
6903  goto out;
6904  }
6905 
6906  trans = btrfs_start_transaction(tree_root, 0);
6907  if (IS_ERR(trans)) {
6908  err = PTR_ERR(trans);
6909  goto out_free;
6910  }
6911 
6912  if (block_rsv)
6913  trans->block_rsv = block_rsv;
6914 
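 /*
  * drop_progress in the root item tells us whether this is a fresh drop
  * (start from the locked root node) or the resume of an interrupted one
  * (re-search to the recorded key at drop_level and continue from there).
  */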
6915  if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6916  level = btrfs_header_level(root->node);
6917  path->nodes[level] = btrfs_lock_root_node(root);
6918  btrfs_set_lock_blocking(path->nodes[level]);
6919  path->slots[level] = 0;
6920  path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6921  memset(&wc->update_progress, 0,
6922  sizeof(wc->update_progress));
6923  } else {
6924  btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6925  memcpy(&wc->update_progress, &key,
6926  sizeof(wc->update_progress));
6927 
6928  level = root_item->drop_level;
6929  BUG_ON(level == 0);
6930  path->lowest_level = level;
6931  ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6932  path->lowest_level = 0;
6933  if (ret < 0) {
6934  err = ret;
6935  goto out_end_trans;
6936  }
6937  WARN_ON(ret > 0);
6938 
6939  /*
6940  * unlock our path, this is safe because only this
6941  * function is allowed to delete this snapshot
6942  */
6943  btrfs_unlock_up_safe(path, 0);
6944 
6945  level = btrfs_header_level(root->node);
6946  while (1) {
6947  btrfs_tree_lock(path->nodes[level]);
6948  btrfs_set_lock_blocking(path->nodes[level]);
6949 
6950  ret = btrfs_lookup_extent_info(trans, root,
6951  path->nodes[level]->start,
6952  path->nodes[level]->len,
6953  &wc->refs[level],
6954  &wc->flags[level]);
6955  if (ret < 0) {
6956  err = ret;
6957  goto out_end_trans;
6958  }
6959  BUG_ON(wc->refs[level] == 0);
6960 
6961  if (level == root_item->drop_level)
6962  break;
6963 
6964  btrfs_tree_unlock(path->nodes[level]);
6965  WARN_ON(wc->refs[level] != 1);
6966  level--;
6967  }
6968  }
6969 
6970  wc->level = level;
6971  wc->shared_level = -1;
6972  wc->stage = DROP_REFERENCE;
6973  wc->update_ref = update_ref;
6974  wc->keep_locks = 0;
6975  wc->for_reloc = for_reloc;
6976  wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6977 
6978  while (1) {
6979  ret = walk_down_tree(trans, root, path, wc);
6980  if (ret < 0) {
6981  err = ret;
6982  break;
6983  }
6984 
6985  ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6986  if (ret < 0) {
6987  err = ret;
6988  break;
6989  }
6990 
6991  if (ret > 0) {
6992  BUG_ON(wc->stage != DROP_REFERENCE);
6993  break;
6994  }
6995 
6996  if (wc->stage == DROP_REFERENCE) {
6997  level = wc->level;
6998  btrfs_node_key(path->nodes[level],
6999  &root_item->drop_progress,
7000  path->slots[level]);
7001  root_item->drop_level = level;
7002  }
7003 
7004  BUG_ON(wc->level == 0);
7005  if (btrfs_should_end_transaction(trans, tree_root)) {
7006  ret = btrfs_update_root(trans, tree_root,
7007  &root->root_key,
7008  root_item);
7009  if (ret) {
7010  btrfs_abort_transaction(trans, tree_root, ret);
7011  err = ret;
7012  goto out_end_trans;
7013  }
7014 
7015  btrfs_end_transaction_throttle(trans, tree_root);
7016  trans = btrfs_start_transaction(tree_root, 0);
7017  if (IS_ERR(trans)) {
7018  err = PTR_ERR(trans);
7019  goto out_free;
7020  }
7021  if (block_rsv)
7022  trans->block_rsv = block_rsv;
7023  }
7024  }
7025  btrfs_release_path(path);
7026  if (err)
7027  goto out_end_trans;
7028 
7029  ret = btrfs_del_root(trans, tree_root, &root->root_key);
7030  if (ret) {
7031  btrfs_abort_transaction(trans, tree_root, ret);
7032  goto out_end_trans;
7033  }
7034 
7035  if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7036  ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7037  NULL, NULL);
7038  if (ret < 0) {
7039  btrfs_abort_transaction(trans, tree_root, ret);
7040  err = ret;
7041  goto out_end_trans;
7042  } else if (ret > 0) {
7043  /* if we fail to delete the orphan item this time
7044  * around, it'll get picked up the next time.
7045  *
7046  * The most common failure here is just -ENOENT.
7047  */
7048  btrfs_del_orphan_item(trans, tree_root,
7049  root->root_key.objectid);
7050  }
7051  }
7052 
7053  if (root->in_radix) {
7054  btrfs_free_fs_root(tree_root->fs_info, root);
7055  } else {
7056  free_extent_buffer(root->node);
7057  free_extent_buffer(root->commit_root);
7058  kfree(root);
7059  }
7060 out_end_trans:
7061  btrfs_end_transaction_throttle(trans, tree_root);
7062 out_free:
7063  kfree(wc);
7064  btrfs_free_path(path);
7065 out:
7066  if (err)
7067  btrfs_std_error(root->fs_info, err);
7068  return err;
7069 }
7070 
7071 /*
7072  * drop subtree rooted at tree block 'node'.
7073  *
7074  * NOTE: this function will unlock and release tree block 'node'
7075  * only used by relocation code
7076  */
7077 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7078  struct btrfs_root *root,
7079  struct extent_buffer *node,
7080  struct extent_buffer *parent)
7081 {
7082  struct btrfs_path *path;
7083  struct walk_control *wc;
7084  int level;
7085  int parent_level;
7086  int ret = 0;
7087  int wret;
7088 
7089  BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7090 
7091  path = btrfs_alloc_path();
7092  if (!path)
7093  return -ENOMEM;
7094 
7095  wc = kzalloc(sizeof(*wc), GFP_NOFS);
7096  if (!wc) {
7097  btrfs_free_path(path);
7098  return -ENOMEM;
7099  }
7100 
7101  btrfs_assert_tree_locked(parent);
7102  parent_level = btrfs_header_level(parent);
7103  extent_buffer_get(parent);
7104  path->nodes[parent_level] = parent;
7105  path->slots[parent_level] = btrfs_header_nritems(parent);
7106 
7107  btrfs_assert_tree_locked(node);
7108  level = btrfs_header_level(node);
7109  path->nodes[level] = node;
7110  path->slots[level] = 0;
7111  path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7112 
7113  wc->refs[parent_level] = 1;
7114  wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7115  wc->level = level;
7116  wc->shared_level = -1;
7117  wc->stage = DROP_REFERENCE;
7118  wc->update_ref = 0;
7119  wc->keep_locks = 1;
7120  wc->for_reloc = 1;
7121  wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7122 
7123  while (1) {
7124  wret = walk_down_tree(trans, root, path, wc);
7125  if (wret < 0) {
7126  ret = wret;
7127  break;
7128  }
7129 
7130  wret = walk_up_tree(trans, root, path, wc, parent_level);
7131  if (wret < 0)
7132  ret = wret;
7133  if (wret != 0)
7134  break;
7135  }
7136 
7137  kfree(wc);
7138  btrfs_free_path(path);
7139  return ret;
7140 }
7141 
7142 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7143 {
7144  u64 num_devices;
7145  u64 stripped;
7146 
7147  /*
7148  * if restripe for this chunk_type is on pick target profile and
7149  * return, otherwise do the usual balance
7150  */
7151  stripped = get_restripe_target(root->fs_info, flags);
7152  if (stripped)
7153  return extended_to_chunk(stripped);
7154 
7155  /*
7156  * we add in the count of missing devices because we want
7157  * to make sure that any RAID levels on a degraded FS
7158  * continue to be honored.
7159  */
7160  num_devices = root->fs_info->fs_devices->rw_devices +
7161  root->fs_info->fs_devices->missing_devices;
7162 
7163  stripped = BTRFS_BLOCK_GROUP_RAID0 |
7164  BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7165 
7166  if (num_devices == 1) {
7167  stripped |= BTRFS_BLOCK_GROUP_DUP;
7168  stripped = flags & ~stripped;
7169 
7170  /* turn raid0 into single device chunks */
7171  if (flags & BTRFS_BLOCK_GROUP_RAID0)
7172  return stripped;
7173 
7174  /* turn mirroring into duplication */
7175  if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7176  BTRFS_BLOCK_GROUP_RAID10))
7177  return stripped | BTRFS_BLOCK_GROUP_DUP;
7178  } else {
7179  /* they already had raid on here, just return */
7180  if (flags & stripped)
7181  return flags;
7182 
7183  stripped |= BTRFS_BLOCK_GROUP_DUP;
7184  stripped = flags & ~stripped;
7185 
7186  /* switch duplicated blocks with raid1 */
7187  if (flags & BTRFS_BLOCK_GROUP_DUP)
7188  return stripped | BTRFS_BLOCK_GROUP_RAID1;
7189 
7190  /* this is drive concat, leave it alone */
7191  }
7192 
7193  return flags;
7194 }
7195 
7196 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7197 {
7198  struct btrfs_space_info *sinfo = cache->space_info;
7199  u64 num_bytes;
7200  u64 min_allocable_bytes;
7201  int ret = -ENOSPC;
7202 
7203 
7204  /*
7205  * We need some metadata space and system metadata space for
7206  * allocating chunks in some corner cases, so keep some slack unless
7207  * we are forced to set the block group read-only.
7208  */
7209  if ((sinfo->flags &
7210  (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7211  !force)
7212  min_allocable_bytes = 1 * 1024 * 1024;
7213  else
7214  min_allocable_bytes = 0;
7215 
7216  spin_lock(&sinfo->lock);
7217  spin_lock(&cache->lock);
7218 
7219  if (cache->ro) {
7220  ret = 0;
7221  goto out;
7222  }
7223 
7224  num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7225  cache->bytes_super - btrfs_block_group_used(&cache->item);
7226 
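 /*
  * Only flip the group read-only if the rest of the space_info can still
  * absorb this group's unused bytes plus the slack computed above.
  */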
7227  if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7228  sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7229  min_allocable_bytes <= sinfo->total_bytes) {
7230  sinfo->bytes_readonly += num_bytes;
7231  cache->ro = 1;
7232  ret = 0;
7233  }
7234 out:
7235  spin_unlock(&cache->lock);
7236  spin_unlock(&sinfo->lock);
7237  return ret;
7238 }
7239 
7240 int btrfs_set_block_group_ro(struct btrfs_root *root,
7241  struct btrfs_block_group_cache *cache)
7242 
7243 {
7244  struct btrfs_trans_handle *trans;
7245  u64 alloc_flags;
7246  int ret;
7247 
7248  BUG_ON(cache->ro);
7249 
7250  trans = btrfs_join_transaction(root);
7251  if (IS_ERR(trans))
7252  return PTR_ERR(trans);
7253 
7254  alloc_flags = update_block_group_flags(root, cache->flags);
7255  if (alloc_flags != cache->flags) {
7256  ret = do_chunk_alloc(trans, root, alloc_flags,
7257  CHUNK_ALLOC_FORCE);
7258  if (ret < 0)
7259  goto out;
7260  }
7261 
7262  ret = set_block_group_ro(cache, 0);
7263  if (!ret)
7264  goto out;
7265  alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7266  ret = do_chunk_alloc(trans, root, alloc_flags,
7267  CHUNK_ALLOC_FORCE);
7268  if (ret < 0)
7269  goto out;
7270  ret = set_block_group_ro(cache, 0);
7271 out:
7272  btrfs_end_transaction(trans, root);
7273  return ret;
7274 }
7275 
7276 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7277  struct btrfs_root *root, u64 type)
7278 {
7279  u64 alloc_flags = get_alloc_profile(root, type);
7280  return do_chunk_alloc(trans, root, alloc_flags,
7282 }
7283 
7284 /*
7285  * helper to account the unused space of all the readonly block groups in the
7286  * list. takes mirrors into account.
7287  */
7288 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7289 {
7290  struct btrfs_block_group_cache *block_group;
7291  u64 free_bytes = 0;
7292  int factor;
7293 
7294  list_for_each_entry(block_group, groups_list, list) {
7295  spin_lock(&block_group->lock);
7296 
7297  if (!block_group->ro) {
7298  spin_unlock(&block_group->lock);
7299  continue;
7300  }
7301 
7302  if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7303  BTRFS_BLOCK_GROUP_RAID10 |
7304  BTRFS_BLOCK_GROUP_DUP))
7305  factor = 2;
7306  else
7307  factor = 1;
7308 
7309  free_bytes += (block_group->key.offset -
7310  btrfs_block_group_used(&block_group->item)) *
7311  factor;
7312 
7313  spin_unlock(&block_group->lock);
7314  }
7315 
7316  return free_bytes;
7317 }
7318 
7319 /*
7320  * helper to account the unused space of all the readonly block groups in the
7321  * space_info. takes mirrors into account.
7322  */
7323 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7324 {
7325  int i;
7326  u64 free_bytes = 0;
7327 
7328  spin_lock(&sinfo->lock);
7329 
7330  for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7331  if (!list_empty(&sinfo->block_groups[i]))
7332  free_bytes += __btrfs_get_ro_block_group_free_space(
7333  &sinfo->block_groups[i]);
7334 
7335  spin_unlock(&sinfo->lock);
7336 
7337  return free_bytes;
7338 }
7339 
7340 void btrfs_set_block_group_rw(struct btrfs_root *root,
7341  struct btrfs_block_group_cache *cache)
7342 {
7343  struct btrfs_space_info *sinfo = cache->space_info;
7344  u64 num_bytes;
7345 
7346  BUG_ON(!cache->ro);
7347 
7348  spin_lock(&sinfo->lock);
7349  spin_lock(&cache->lock);
7350  num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7351  cache->bytes_super - btrfs_block_group_used(&cache->item);
7352  sinfo->bytes_readonly -= num_bytes;
7353  cache->ro = 0;
7354  spin_unlock(&cache->lock);
7355  spin_unlock(&sinfo->lock);
7356 }
7357 
7358 /*
7359  * checks to see if it's even possible to relocate this block group.
7360  *
7361  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
7362  * ok to go ahead and try.
7363  */
7364 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7365 {
7366  struct btrfs_block_group_cache *block_group;
7367  struct btrfs_space_info *space_info;
7368  struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7369  struct btrfs_device *device;
7370  u64 min_free;
7371  u64 dev_min = 1;
7372  u64 dev_nr = 0;
7373  u64 target;
7374  int index;
7375  int full = 0;
7376  int ret = 0;
7377 
7378  block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7379 
7380  /* odd, couldn't find the block group, leave it alone */
7381  if (!block_group)
7382  return -1;
7383 
7384  min_free = btrfs_block_group_used(&block_group->item);
7385 
7386  /* no bytes used, we're good */
7387  if (!min_free)
7388  goto out;
7389 
7390  space_info = block_group->space_info;
7391  spin_lock(&space_info->lock);
7392 
7393  full = space_info->full;
7394 
7395  /*
7396  * if this is the last block group we have in this space, we can't
7397  * relocate it unless we're able to allocate a new chunk below.
7398  *
7399  * Otherwise, we need to make sure we have room in the space to handle
7400  * all of the extents from this block group. If we can, we're good
7401  */
7402  if ((space_info->total_bytes != block_group->key.offset) &&
7403  (space_info->bytes_used + space_info->bytes_reserved +
7404  space_info->bytes_pinned + space_info->bytes_readonly +
7405  min_free < space_info->total_bytes)) {
7406  spin_unlock(&space_info->lock);
7407  goto out;
7408  }
7409  spin_unlock(&space_info->lock);
7410 
7411  /*
7412  * ok we don't have enough space, but maybe we have free space on our
7413  * devices to allocate new chunks for relocation, so loop through our
7414  * alloc devices and guess if we have enough space. if this block
7415  * group is going to be restriped, run checks against the target
7416  * profile instead of the current one.
7417  */
7418  ret = -1;
7419 
7420  /*
7421  * index:
7422  * 0: raid10
7423  * 1: raid1
7424  * 2: dup
7425  * 3: raid0
7426  * 4: single
7427  */
7428  target = get_restripe_target(root->fs_info, block_group->flags);
7429  if (target) {
7430  index = __get_block_group_index(extended_to_chunk(target));
7431  } else {
7432  /*
7433  * this is just a balance, so if we were marked as full
7434  * we know there is no space for a new chunk
7435  */
7436  if (full)
7437  goto out;
7438 
7439  index = get_block_group_index(block_group);
7440  }
7441 
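 /*
  * Scale the requirement by profile: raid10 spreads half the data over
  * at least four devices, raid1 needs two devices, dup needs double the
  * space on a single device, and raid0 divides it across every rw device.
  */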
7442  if (index == 0) {
7443  dev_min = 4;
7444  /* Divide by 2 */
7445  min_free >>= 1;
7446  } else if (index == 1) {
7447  dev_min = 2;
7448  } else if (index == 2) {
7449  /* Multiply by 2 */
7450  min_free <<= 1;
7451  } else if (index == 3) {
7452  dev_min = fs_devices->rw_devices;
7453  do_div(min_free, dev_min);
7454  }
7455 
7456  mutex_lock(&root->fs_info->chunk_mutex);
7457  list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7458  u64 dev_offset;
7459 
7460  /*
7461  * check to make sure we can actually find a chunk with enough
7462  * space to fit our block group in.
7463  */
7464  if (device->total_bytes > device->bytes_used + min_free) {
7465  ret = find_free_dev_extent(device, min_free,
7466  &dev_offset, NULL);
7467  if (!ret)
7468  dev_nr++;
7469 
7470  if (dev_nr >= dev_min)
7471  break;
7472 
7473  ret = -1;
7474  }
7475  }
7476  mutex_unlock(&root->fs_info->chunk_mutex);
7477 out:
7478  btrfs_put_block_group(block_group);
7479  return ret;
7480 }
7481 
7482 static int find_first_block_group(struct btrfs_root *root,
7483  struct btrfs_path *path, struct btrfs_key *key)
7484 {
7485  int ret = 0;
7486  struct btrfs_key found_key;
7487  struct extent_buffer *leaf;
7488  int slot;
7489 
7490  ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7491  if (ret < 0)
7492  goto out;
7493 
7494  while (1) {
7495  slot = path->slots[0];
7496  leaf = path->nodes[0];
7497  if (slot >= btrfs_header_nritems(leaf)) {
7498  ret = btrfs_next_leaf(root, path);
7499  if (ret == 0)
7500  continue;
7501  if (ret < 0)
7502  goto out;
7503  break;
7504  }
7505  btrfs_item_key_to_cpu(leaf, &found_key, slot);
7506 
7507  if (found_key.objectid >= key->objectid &&
7508  found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7509  ret = 0;
7510  goto out;
7511  }
7512  path->slots[0]++;
7513  }
7514 out:
7515  return ret;
7516 }
7517 
7518 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7519 {
7520  struct btrfs_block_group_cache *block_group;
7521  u64 last = 0;
7522 
7523  while (1) {
7524  struct inode *inode;
7525 
7526  block_group = btrfs_lookup_first_block_group(info, last);
7527  while (block_group) {
7528  spin_lock(&block_group->lock);
7529  if (block_group->iref)
7530  break;
7531  spin_unlock(&block_group->lock);
7532  block_group = next_block_group(info->tree_root,
7533  block_group);
7534  }
7535  if (!block_group) {
7536  if (last == 0)
7537  break;
7538  last = 0;
7539  continue;
7540  }
7541 
7542  inode = block_group->inode;
7543  block_group->iref = 0;
7544  block_group->inode = NULL;
7545  spin_unlock(&block_group->lock);
7546  iput(inode);
7547  last = block_group->key.objectid + block_group->key.offset;
7548  btrfs_put_block_group(block_group);
7549  }
7550 }
7551 
7552 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7553 {
7554  struct btrfs_block_group_cache *block_group;
7555  struct btrfs_space_info *space_info;
7556  struct btrfs_caching_control *caching_ctl;
7557  struct rb_node *n;
7558 
7559  down_write(&info->extent_commit_sem);
7560  while (!list_empty(&info->caching_block_groups)) {
7561  caching_ctl = list_entry(info->caching_block_groups.next,
7562  struct btrfs_caching_control, list);
7563  list_del(&caching_ctl->list);
7564  put_caching_control(caching_ctl);
7565  }
7566  up_write(&info->extent_commit_sem);
7567 
7568  spin_lock(&info->block_group_cache_lock);
7569  while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7570  block_group = rb_entry(n, struct btrfs_block_group_cache,
7571  cache_node);
7572  rb_erase(&block_group->cache_node,
7573  &info->block_group_cache_tree);
7574  spin_unlock(&info->block_group_cache_lock);
7575 
7576  down_write(&block_group->space_info->groups_sem);
7577  list_del(&block_group->list);
7578  up_write(&block_group->space_info->groups_sem);
7579 
7580  if (block_group->cached == BTRFS_CACHE_STARTED)
7581  wait_block_group_cache_done(block_group);
7582 
7583  /*
7584  * We haven't cached this block group, which means we could
7585  * possibly have excluded extents on this block group.
7586  */
7587  if (block_group->cached == BTRFS_CACHE_NO)
7588  free_excluded_extents(info->extent_root, block_group);
7589 
7590  btrfs_remove_free_space_cache(block_group);
7591  btrfs_put_block_group(block_group);
7592 
7593  spin_lock(&info->block_group_cache_lock);
7594  }
7595  spin_unlock(&info->block_group_cache_lock);
7596 
7597  /* now that all the block groups are freed, go through and
7598  * free all the space_info structs. This is only called during
7599  * the final stages of unmount, and so we know nobody is
7600  * using them. We call synchronize_rcu() once before we start,
7601  * just to be on the safe side.
7602  */
7603  synchronize_rcu();
7604 
7605  release_global_block_rsv(info);
7606 
7607  while(!list_empty(&info->space_info)) {
7608  space_info = list_entry(info->space_info.next,
7609  struct btrfs_space_info,
7610  list);
7611  if (space_info->bytes_pinned > 0 ||
7612  space_info->bytes_reserved > 0 ||
7613  space_info->bytes_may_use > 0) {
7614  WARN_ON(1);
7615  dump_space_info(space_info, 0, 0);
7616  }
7617  list_del(&space_info->list);
7618  kfree(space_info);
7619  }
7620  return 0;
7621 }
7622 
7623 static void __link_block_group(struct btrfs_space_info *space_info,
7624  struct btrfs_block_group_cache *cache)
7625 {
7626  int index = get_block_group_index(cache);
7627 
7628  down_write(&space_info->groups_sem);
7629  list_add_tail(&cache->list, &space_info->block_groups[index]);
7630  up_write(&space_info->groups_sem);
7631 }
7632 
7633 int btrfs_read_block_groups(struct btrfs_root *root)
7634 {
7635  struct btrfs_path *path;
7636  int ret;
7637  struct btrfs_block_group_cache *cache;
7638  struct btrfs_fs_info *info = root->fs_info;
7639  struct btrfs_space_info *space_info;
7640  struct btrfs_key key;
7641  struct btrfs_key found_key;
7642  struct extent_buffer *leaf;
7643  int need_clear = 0;
7644  u64 cache_gen;
7645 
7646  root = info->extent_root;
7647  key.objectid = 0;
7648  key.offset = 0;
7649  btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7650  path = btrfs_alloc_path();
7651  if (!path)
7652  return -ENOMEM;
7653  path->reada = 1;
7654 
7655  cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7656  if (btrfs_test_opt(root, SPACE_CACHE) &&
7657  btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7658  need_clear = 1;
7659  if (btrfs_test_opt(root, CLEAR_CACHE))
7660  need_clear = 1;
7661 
7662  while (1) {
7663  ret = find_first_block_group(root, path, &key);
7664  if (ret > 0)
7665  break;
7666  if (ret != 0)
7667  goto error;
7668  leaf = path->nodes[0];
7669  btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7670  cache = kzalloc(sizeof(*cache), GFP_NOFS);
7671  if (!cache) {
7672  ret = -ENOMEM;
7673  goto error;
7674  }
7675  cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7676  GFP_NOFS);
7677  if (!cache->free_space_ctl) {
7678  kfree(cache);
7679  ret = -ENOMEM;
7680  goto error;
7681  }
7682 
7683  atomic_set(&cache->count, 1);
7684  spin_lock_init(&cache->lock);
7685  cache->fs_info = info;
7686  INIT_LIST_HEAD(&cache->list);
7687  INIT_LIST_HEAD(&cache->cluster_list);
7688 
7689  if (need_clear) {
7690  /*
7691  * When we mount with old space cache, we need to
7692  * set BTRFS_DC_CLEAR and set dirty flag.
7693  *
7694  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7695  * truncate the old free space cache inode and
7696  * setup a new one.
7697  * b) Setting 'dirty flag' makes sure that we flush
7698  * the new space cache info onto disk.
7699  */
7700  cache->disk_cache_state = BTRFS_DC_CLEAR;
7701  if (btrfs_test_opt(root, SPACE_CACHE))
7702  cache->dirty = 1;
7703  }
7704 
7705  read_extent_buffer(leaf, &cache->item,
7706  btrfs_item_ptr_offset(leaf, path->slots[0]),
7707  sizeof(cache->item));
7708  memcpy(&cache->key, &found_key, sizeof(found_key));
7709 
7710  key.objectid = found_key.objectid + found_key.offset;
7711  btrfs_release_path(path);
7712  cache->flags = btrfs_block_group_flags(&cache->item);
7713  cache->sectorsize = root->sectorsize;
7714 
7715  btrfs_init_free_space_ctl(cache);
7716 
7717  /*
7718  * We need to exclude the super stripes now so that the space
7719  * info has super bytes accounted for, otherwise we'll think
7720  * we have more space than we actually do.
7721  */
7722  exclude_super_stripes(root, cache);
7723 
7724  /*
7725  * check for two cases, either we are full, and therefore
7726  * don't need to bother with the caching work since we won't
7727  * find any space, or we are empty, and we can just add all
7728  * the space in and be done with it. This saves us a lot of
7729  * time, particularly in the full case.
7730  */
7731  if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7732  cache->last_byte_to_unpin = (u64)-1;
7733  cache->cached = BTRFS_CACHE_FINISHED;
7734  free_excluded_extents(root, cache);
7735  } else if (btrfs_block_group_used(&cache->item) == 0) {
7736  cache->last_byte_to_unpin = (u64)-1;
7737  cache->cached = BTRFS_CACHE_FINISHED;
7738  add_new_free_space(cache, root->fs_info,
7739  found_key.objectid,
7740  found_key.objectid +
7741  found_key.offset);
7742  free_excluded_extents(root, cache);
7743  }
7744 
7745  ret = update_space_info(info, cache->flags, found_key.offset,
7746  btrfs_block_group_used(&cache->item),
7747  &space_info);
7748  BUG_ON(ret); /* -ENOMEM */
7749  cache->space_info = space_info;
7750  spin_lock(&cache->space_info->lock);
7751  cache->space_info->bytes_readonly += cache->bytes_super;
7752  spin_unlock(&cache->space_info->lock);
7753 
7754  __link_block_group(space_info, cache);
7755 
7756  ret = btrfs_add_block_group_cache(root->fs_info, cache);
7757  BUG_ON(ret); /* Logic error */
7758 
7759  set_avail_alloc_bits(root->fs_info, cache->flags);
7760  if (btrfs_chunk_readonly(root, cache->key.objectid))
7761  set_block_group_ro(cache, 1);
7762  }
7763 
7764  list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7765  if (!(get_alloc_profile(root, space_info->flags) &
7766  (BTRFS_BLOCK_GROUP_RAID10 |
7767  BTRFS_BLOCK_GROUP_RAID1 |
7768  BTRFS_BLOCK_GROUP_DUP)))
7769  continue;
7770  /*
7771  * avoid allocating from un-mirrored block group if there are
7772  * mirrored block groups.
7773  */
7774  list_for_each_entry(cache, &space_info->block_groups[3], list)
7775  set_block_group_ro(cache, 1);
7776  list_for_each_entry(cache, &space_info->block_groups[4], list)
7777  set_block_group_ro(cache, 1);
7778  }
7779 
7780  init_global_block_rsv(info);
7781  ret = 0;
7782 error:
7783  btrfs_free_path(path);
7784  return ret;
7785 }
7786 
7787 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788  struct btrfs_root *root)
7789 {
7790  struct btrfs_block_group_cache *block_group, *tmp;
7791  struct btrfs_root *extent_root = root->fs_info->extent_root;
7792  struct btrfs_block_group_item item;
7793  struct btrfs_key key;
7794  int ret = 0;
7795 
7796  list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797  new_bg_list) {
7798  list_del_init(&block_group->new_bg_list);
7799 
7800  if (ret)
7801  continue;
7802 
7803  spin_lock(&block_group->lock);
7804  memcpy(&item, &block_group->item, sizeof(item));
7805  memcpy(&key, &block_group->key, sizeof(key));
7806  spin_unlock(&block_group->lock);
7807 
7808  ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809  sizeof(item));
7810  if (ret)
7811  btrfs_abort_transaction(trans, extent_root, ret);
7812  }
7813 }
7814 
7815 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7816  struct btrfs_root *root, u64 bytes_used,
7817  u64 type, u64 chunk_objectid, u64 chunk_offset,
7818  u64 size)
7819 {
7820  int ret;
7821  struct btrfs_root *extent_root;
7822  struct btrfs_block_group_cache *cache;
7823 
7824  extent_root = root->fs_info->extent_root;
7825 
7826  root->fs_info->last_trans_log_full_commit = trans->transid;
7827 
7828  cache = kzalloc(sizeof(*cache), GFP_NOFS);
7829  if (!cache)
7830  return -ENOMEM;
7831  cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7832  GFP_NOFS);
7833  if (!cache->free_space_ctl) {
7834  kfree(cache);
7835  return -ENOMEM;
7836  }
7837 
7838  cache->key.objectid = chunk_offset;
7839  cache->key.offset = size;
7840  cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7841  cache->sectorsize = root->sectorsize;
7842  cache->fs_info = root->fs_info;
7843 
7844  atomic_set(&cache->count, 1);
7845  spin_lock_init(&cache->lock);
7846  INIT_LIST_HEAD(&cache->list);
7847  INIT_LIST_HEAD(&cache->cluster_list);
7848  INIT_LIST_HEAD(&cache->new_bg_list);
7849 
7850  btrfs_init_free_space_ctl(cache);
7851 
7852  btrfs_set_block_group_used(&cache->item, bytes_used);
7853  btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7854  cache->flags = type;
7855  btrfs_set_block_group_flags(&cache->item, type);
7856 
7857  cache->last_byte_to_unpin = (u64)-1;
7858  cache->cached = BTRFS_CACHE_FINISHED;
7859  exclude_super_stripes(root, cache);
7860 
7861  add_new_free_space(cache, root->fs_info, chunk_offset,
7862  chunk_offset + size);
7863 
7864  free_excluded_extents(root, cache);
7865 
7866  ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7867  &cache->space_info);
7868  BUG_ON(ret); /* -ENOMEM */
7869  update_global_block_rsv(root->fs_info);
7870 
7871  spin_lock(&cache->space_info->lock);
7872  cache->space_info->bytes_readonly += cache->bytes_super;
7873  spin_unlock(&cache->space_info->lock);
7874 
7875  __link_block_group(cache->space_info, cache);
7876 
7877  ret = btrfs_add_block_group_cache(root->fs_info, cache);
7878  BUG_ON(ret); /* Logic error */
7879 
7880  list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7881 
7882  set_avail_alloc_bits(extent_root->fs_info, type);
7883 
7884  return 0;
7885 }
7886 
7887 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7888 {
7889  u64 extra_flags = chunk_to_extended(flags) &
7890  BTRFS_EXTENDED_PROFILE_MASK;
7891 
7892  if (flags & BTRFS_BLOCK_GROUP_DATA)
7893  fs_info->avail_data_alloc_bits &= ~extra_flags;
7894  if (flags & BTRFS_BLOCK_GROUP_METADATA)
7895  fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7896  if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7897  fs_info->avail_system_alloc_bits &= ~extra_flags;
7898 }
7899 
7900 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7901  struct btrfs_root *root, u64 group_start)
7902 {
7903  struct btrfs_path *path;
7904  struct btrfs_block_group_cache *block_group;
7905  struct btrfs_free_cluster *cluster;
7906  struct btrfs_root *tree_root = root->fs_info->tree_root;
7907  struct btrfs_key key;
7908  struct inode *inode;
7909  int ret;
7910  int index;
7911  int factor;
7912 
7913  root = root->fs_info->extent_root;
7914 
7915  block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7916  BUG_ON(!block_group);
7917  BUG_ON(!block_group->ro);
7918 
7919  /*
7920  * Free the reserved super bytes from this block group before
7921  * removing it.
7922  */
7923  free_excluded_extents(root, block_group);
7924 
7925  memcpy(&key, &block_group->key, sizeof(key));
7926  index = get_block_group_index(block_group);
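	/*
	 * note (added annotation): DUP, RAID1 and RAID10 groups keep two
	 * copies of every byte, so they consume twice their logical size of
	 * raw disk space; factor feeds the disk_total adjustment below.
	 */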
7927  if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7928  BTRFS_BLOCK_GROUP_RAID1 |
7929  BTRFS_BLOCK_GROUP_RAID10))
7930  factor = 2;
7931  else
7932  factor = 1;
7933 
7934  /* make sure this block group isn't part of an allocation cluster */
7935  cluster = &root->fs_info->data_alloc_cluster;
7936  spin_lock(&cluster->refill_lock);
7937  btrfs_return_cluster_to_free_space(block_group, cluster);
7938  spin_unlock(&cluster->refill_lock);
7939 
7940  /*
7941  * make sure this block group isn't part of a metadata
7942  * allocation cluster
7943  */
7944  cluster = &root->fs_info->meta_alloc_cluster;
7945  spin_lock(&cluster->refill_lock);
7946  btrfs_return_cluster_to_free_space(block_group, cluster);
7947  spin_unlock(&cluster->refill_lock);
7948 
7949  path = btrfs_alloc_path();
7950  if (!path) {
7951  ret = -ENOMEM;
7952  goto out;
7953  }
7954 
7955  inode = lookup_free_space_inode(tree_root, block_group, path);
7956  if (!IS_ERR(inode)) {
7957  ret = btrfs_orphan_add(trans, inode);
7958  if (ret) {
7959  btrfs_add_delayed_iput(inode);
7960  goto out;
7961  }
7962  clear_nlink(inode);
7963  /* One for the block group's ref */
7964  spin_lock(&block_group->lock);
7965  if (block_group->iref) {
7966  block_group->iref = 0;
7967  block_group->inode = NULL;
7968  spin_unlock(&block_group->lock);
7969  iput(inode);
7970  } else {
7971  spin_unlock(&block_group->lock);
7972  }
7973  /* One for our lookup ref */
7974  btrfs_add_delayed_iput(inode);
7975  }
7976 
7977  key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7978  key.offset = block_group->key.objectid;
7979  key.type = 0;
7980 
7981  ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7982  if (ret < 0)
7983  goto out;
7984  if (ret > 0)
7985  btrfs_release_path(path);
7986  if (ret == 0) {
7987  ret = btrfs_del_item(trans, tree_root, path);
7988  if (ret)
7989  goto out;
7990  btrfs_release_path(path);
7991  }
7992 
7993  spin_lock(&root->fs_info->block_group_cache_lock);
7994  rb_erase(&block_group->cache_node,
7995  &root->fs_info->block_group_cache_tree);
7996  spin_unlock(&root->fs_info->block_group_cache_lock);
7997 
7998  down_write(&block_group->space_info->groups_sem);
7999  /*
8000  * we must use list_del_init so people can check to see if they
8001  * are still on the list after taking the semaphore
8002  */
8003  list_del_init(&block_group->list);
8004  if (list_empty(&block_group->space_info->block_groups[index]))
8005  clear_avail_alloc_bits(root->fs_info, block_group->flags);
8006  up_write(&block_group->space_info->groups_sem);
8007 
8008  if (block_group->cached == BTRFS_CACHE_STARTED)
8009  wait_block_group_cache_done(block_group);
8010 
8011  btrfs_remove_free_space_cache(block_group);
8012 
8013  spin_lock(&block_group->space_info->lock);
8014  block_group->space_info->total_bytes -= block_group->key.offset;
8015  block_group->space_info->bytes_readonly -= block_group->key.offset;
8016  block_group->space_info->disk_total -= block_group->key.offset * factor;
8017  spin_unlock(&block_group->space_info->lock);
8018 
8019  memcpy(&key, &block_group->key, sizeof(key));
8020 
8021  btrfs_clear_space_info_full(root->fs_info);
8022 
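	/*
	 * drop both remaining references: one held by the block group cache
	 * rb-tree (removed by rb_erase above) and one taken by our
	 * btrfs_lookup_block_group() call at the start of this function.
	 */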
8023  btrfs_put_block_group(block_group);
8024  btrfs_put_block_group(block_group);
8025 
8026  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8027  if (ret > 0)
8028  ret = -EIO;
8029  if (ret < 0)
8030  goto out;
8031 
8032  ret = btrfs_del_item(trans, root, path);
8033 out:
8034  btrfs_free_path(path);
8035  return ret;
8036 }
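/*
 * Note (assumption; the call site is not in this file):
 * btrfs_remove_block_group() is invoked from the chunk removal path in
 * volumes.c once relocation has emptied the block group and the caller has
 * marked it read-only, which is what the BUG_ON(!block_group->ro) above
 * checks.
 */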
8037 
8038 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8039 {
8040  struct btrfs_space_info *space_info;
8041  struct btrfs_super_block *disk_super;
8042  u64 features;
8043  u64 flags;
8044  int mixed = 0;
8045  int ret;
8046 
8047  disk_super = fs_info->super_copy;
8048  if (!btrfs_super_root(disk_super))
8049  return 1;
8050 
8051  features = btrfs_super_incompat_flags(disk_super);
8052  if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8053  mixed = 1;
8054 
8055  flags = BTRFS_BLOCK_GROUP_SYSTEM;
8056  ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8057  if (ret)
8058  goto out;
8059 
8060  if (mixed) {
8061  flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8062  ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8063  } else {
8064  flags = BTRFS_BLOCK_GROUP_METADATA;
8065  ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8066  if (ret)
8067  goto out;
8068 
8069  flags = BTRFS_BLOCK_GROUP_DATA;
8070  ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8071  }
8072 out:
8073  return ret;
8074 }
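/*
 * Note (assumption; the call site is not in this file):
 * btrfs_init_space_info() is called during mount so that the system,
 * metadata and data space_infos exist even before any block group of the
 * corresponding type has been read or allocated.
 */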
8075 
8076 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8077 {
8078  return unpin_extent_range(root, start, end);
8079 }
8080 
8081 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8082  u64 num_bytes, u64 *actual_bytes)
8083 {
8084  return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8085 }
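/*
 * Note (assumption; the call sites are not in this file): the two
 * btrfs_error_* wrappers above are used by the error cleanup code in
 * disk-io.c when tearing down pinned extents after a transaction abort.
 */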
8086 
8087 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8088 {
8089  struct btrfs_fs_info *fs_info = root->fs_info;
8090  struct btrfs_block_group_cache *cache = NULL;
8091  u64 group_trimmed;
8092  u64 start;
8093  u64 end;
8094  u64 trimmed = 0;
8095  u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8096  int ret = 0;
8097 
8098  /*
8099  * try to trim all FS space; our first block group may start at a non-zero offset.
8100  */
8101  if (range->len == total_bytes)
8102  cache = btrfs_lookup_first_block_group(fs_info, range->start);
8103  else
8104  cache = btrfs_lookup_block_group(fs_info, range->start);
8105 
8106  while (cache) {
8107  if (cache->key.objectid >= (range->start + range->len)) {
8108  btrfs_put_block_group(cache);
8109  break;
8110  }
8111 
8112  start = max(range->start, cache->key.objectid);
8113  end = min(range->start + range->len,
8114  cache->key.objectid + cache->key.offset);
8115 
8116  if (end - start >= range->minlen) {
8117  if (!block_group_cache_done(cache)) {
8118  ret = cache_block_group(cache, NULL, root, 0);
8119  if (!ret)
8120  wait_block_group_cache_done(cache);
8121  }
8122  ret = btrfs_trim_block_group(cache,
8123  &group_trimmed,
8124  start,
8125  end,
8126  range->minlen);
8127 
8128  trimmed += group_trimmed;
8129  if (ret) {
8130  btrfs_put_block_group(cache);
8131  break;
8132  }
8133  }
8134 
8135  cache = next_block_group(fs_info->tree_root, cache);
8136  }
8137 
8138  range->len = trimmed;
8139  return ret;
8140 }
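/*
 * Usage sketch (illustration only; the ioctl handler lives in ioctl.c and is
 * assumed here): FITRIM reaches btrfs_trim_fs() roughly as:
 *
 *	struct fstrim_range range;
 *
 *	if (copy_from_user(&range, arg, sizeof(range)))
 *		return -EFAULT;
 *	ret = btrfs_trim_fs(fs_info->tree_root, &range);
 *	if (ret)
 *		return ret;
 *	if (copy_to_user(arg, &range, sizeof(range)))
 *		return -EFAULT;
 *
 * On success range.len is overwritten above with the total number of bytes
 * trimmed, which the handler copies back to user space.
 */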