Linux Kernel 3.7.1
tree-log.c
1 /*
2  * Copyright (C) 2008 Oracle. All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/list_sort.h>
22 #include "ctree.h"
23 #include "transaction.h"
24 #include "disk-io.h"
25 #include "locking.h"
26 #include "print-tree.h"
27 #include "backref.h"
28 #include "compat.h"
29 #include "tree-log.h"
30 #include "hash.h"
31 
32 /* magic values for the inode_only field in btrfs_log_inode:
33  *
34  * LOG_INODE_ALL means to log everything
35  * LOG_INODE_EXISTS means to log just enough to recreate the inode
36  * during log replay
37  */
38 #define LOG_INODE_ALL 0
39 #define LOG_INODE_EXISTS 1
40 
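/*
 * Illustrative sketch, not part of the original file: assuming the
 * static btrfs_log_inode() declared further down, an fsync-style
 * caller logs the inode itself in full, but a parent directory only
 * well enough to be recreated during replay:
 *
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
 *	if (ret == 0 && parent)
 *		ret = btrfs_log_inode(trans, root, parent,
 *				      LOG_INODE_EXISTS);
 */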
41 /*
42  * directory trouble cases
43  *
44  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
45  * log, we must force a full commit before doing an fsync of the directory
46  * where the unlink was done.
47  * ---> record transid of last unlink/rename per directory
48  *
49  * mkdir foo/some_dir
50  * normal commit
51  * rename foo/some_dir foo2/some_dir
52  * mkdir foo/some_dir
53  * fsync foo/some_dir/some_file
54  *
55  * The fsync above will unlink the original some_dir without recording
56  * it in its new location (foo2). After a crash, some_dir will be gone
57  * unless the fsync of some_file forces a full commit
58  *
59  * 2) we must log any new names for any file or dir that is in the fsync
60  * log. ---> check inode while renaming/linking.
61  *
62  * 2a) we must log any new names for any file or dir during rename
63  * when the directory they are being removed from was logged.
64  * ---> check inode and old parent dir during rename
65  *
66  * 2a is actually the more important variant. Without the extra logging
67  * a crash might unlink the old name without recreating the new one
68  *
69  * 3) after a crash, we must go through any directories with a link count
70  * of zero and redo the rm -rf
71  *
72  * mkdir f1/foo
73  * normal commit
74  * rm -rf f1/foo
75  * fsync(f1)
76  *
77  * The directory f1 was fully removed from the FS, but fsync was never
78  * called on f1, only its parent dir. After a crash the rm -rf must
79  * be replayed. This must be able to recurse down the entire
80  * directory tree. The inode link count fixup code takes care of the
81  * ugly details.
82  */
83 
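/*
 * A minimal sketch of case 1 above, assuming the per-inode
 * last_unlink_trans field btrfs keeps in memory and a last_committed
 * variable holding the last committed transaction id: if the directory
 * saw an unlink or rename after the last commit, an fsync inside it
 * cannot trust the log alone and must force a full commit:
 *
 *	if (BTRFS_I(dir)->last_unlink_trans > last_committed)
 *		return 1;	caller falls back to a full commit
 */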
84 /*
85  * stages for the tree walking. The first
86  * stage (0) is to only pin down the blocks we find.
87  * The second stage (1) is to make sure that all the inodes
88  * we find in the log are created in the subvolume.
89  *
90  * The last stage is to deal with directories and links and extents
91  * and all the other fun semantics
92  */
93 #define LOG_WALK_PIN_ONLY 0
94 #define LOG_WALK_REPLAY_INODES 1
95 #define LOG_WALK_REPLAY_ALL 2
96 
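/*
 * Rough sketch of how recovery drives these stages (condensed from the
 * recovery path, not verbatim): pin all the log extents first, then
 * repeat the walk once per replay stage using the walk_control struct
 * defined below:
 *
 *	struct walk_control wc = {
 *		.process_func	= process_one_buffer,
 *		.stage		= LOG_WALK_PIN_ONLY,
 *		.pin		= 1,
 *	};
 *
 *	walk_log_tree(trans, log, &wc);
 *	wc.pin = 0;
 *	wc.process_func = replay_one_buffer;
 *	for (wc.stage = LOG_WALK_REPLAY_INODES;
 *	     wc.stage <= LOG_WALK_REPLAY_ALL; wc.stage++)
 *		walk_log_tree(trans, log, &wc);
 */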
97 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
98  struct btrfs_root *root, struct inode *inode,
99  int inode_only);
100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
101  struct btrfs_root *root,
102  struct btrfs_path *path, u64 objectid);
103 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
104  struct btrfs_root *root,
105  struct btrfs_root *log,
106  struct btrfs_path *path,
107  u64 dirid, int del_all);
108 
109 /*
110  * tree logging is a special write ahead log used to make sure that
111  * fsyncs and O_SYNCs can happen without doing full tree commits.
112  *
113  * Full tree commits are expensive because they require commonly
114  * modified blocks to be recowed, creating many dirty pages in the
115  * extent tree and a 4x-6x higher write load than ext3.
116  *
117  * Instead of doing a tree commit on every fsync, we use the
118  * key ranges and transaction ids to find items for a given file or directory
119  * that have changed in this transaction. Those items are copied into
120  * a special tree (one per subvolume root), that tree is written to disk
121  * and then the fsync is considered complete.
122  *
123  * After a crash, items are copied out of the log-tree back into the
124  * subvolume tree. Any file data extents found are recorded in the extent
125  * allocation tree, and the log-tree freed.
126  *
127  * The log tree is read three times, once to pin down all the extents it is
128  * using in ram, once to create all the inodes logged in the tree,
129  * and once to do all the other items.
130  */
131 
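/*
 * Hedged sketch of the fsync fast path described above; the helpers
 * named here live elsewhere in btrfs and the real sequence handles
 * more locking and error cases:
 *
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root);
 *	btrfs_end_transaction(trans, root);
 */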
132 /*
133  * start a sub transaction and setup the log tree
134  * this increments the log tree writer count to make the people
135  * syncing the tree wait for us to finish
136  */
137 static int start_log_trans(struct btrfs_trans_handle *trans,
138  struct btrfs_root *root)
139 {
140  int ret;
141  int err = 0;
142 
143  mutex_lock(&root->log_mutex);
144  if (root->log_root) {
145  if (!root->log_start_pid) {
146  root->log_start_pid = current->pid;
147  root->log_multiple_pids = false;
148  } else if (root->log_start_pid != current->pid) {
149  root->log_multiple_pids = true;
150  }
151 
152  atomic_inc(&root->log_batch);
153  atomic_inc(&root->log_writers);
154  mutex_unlock(&root->log_mutex);
155  return 0;
156  }
157  root->log_multiple_pids = false;
158  root->log_start_pid = current->pid;
159  mutex_lock(&root->fs_info->tree_log_mutex);
160  if (!root->fs_info->log_root_tree) {
161  ret = btrfs_init_log_root_tree(trans, root->fs_info);
162  if (ret)
163  err = ret;
164  }
165  if (err == 0 && !root->log_root) {
166  ret = btrfs_add_log_tree(trans, root);
167  if (ret)
168  err = ret;
169  }
170  mutex_unlock(&root->fs_info->tree_log_mutex);
171  atomic_inc(&root->log_batch);
172  atomic_inc(&root->log_writers);
173  mutex_unlock(&root->log_mutex);
174  return err;
175 }
176 
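/*
 * Sketch of the expected pairing (an assumption about callers, based
 * on how the writer count is used): every successful start_log_trans()
 * is balanced by btrfs_end_log_trans() once the caller has finished
 * copying items into the log tree:
 *
 *	ret = start_log_trans(trans, root);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, inode_only);
 *	btrfs_end_log_trans(root);
 */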
177 /*
178  * returns 0 if there was a log transaction running and we were able
179  * to join, or returns -ENOENT if there are no transactions
180  * in progress
181  */
182 static int join_running_log_trans(struct btrfs_root *root)
183 {
184  int ret = -ENOENT;
185 
186  smp_mb();
187  if (!root->log_root)
188  return -ENOENT;
189 
190  mutex_lock(&root->log_mutex);
191  if (root->log_root) {
192  ret = 0;
193  atomic_inc(&root->log_writers);
194  }
195  mutex_unlock(&root->log_mutex);
196  return ret;
197 }
198 
199 /*
200  * This either makes the current running log transaction wait
201  * until you call btrfs_end_log_trans() or it makes any future
202  * log transactions wait until you call btrfs_end_log_trans()
203  */
204 int btrfs_pin_log_trans(struct btrfs_root *root)
205 {
206  int ret = -ENOENT;
207 
208  mutex_lock(&root->log_mutex);
209  atomic_inc(&root->log_writers);
210  mutex_unlock(&root->log_mutex);
211  return ret;
212 }
213 
214 /*
215  * indicate we're done making changes to the log tree
216  * and wake up anyone waiting to do a sync
217  */
218 void btrfs_end_log_trans(struct btrfs_root *root)
219 {
220  if (atomic_dec_and_test(&root->log_writers)) {
221  smp_mb();
222  if (waitqueue_active(&root->log_writer_wait))
223  wake_up(&root->log_writer_wait);
224  }
225 }
226 
227 
228 /*
229  * the walk control struct is used to pass state down the chain when
230  * processing the log tree. The stage field tells us which part
231  * of the log tree processing we are currently doing. The others
232  * are state fields used for that specific part
233  */
234 struct walk_control {
235  /* should we free the extent on disk when done? This is used
236  * at transaction commit time while freeing a log tree
237  */
238  int free;
239 
240  /* should we write out the extent buffer? This is used
241  * while flushing the log tree to disk during a sync
242  */
243  int write;
244 
245  /* should we wait for the extent buffer io to finish? Also used
246  * while flushing the log tree to disk for a sync
247  */
248  int wait;
249 
250  /* pin only walk, we record which extents on disk belong to the
251  * log trees
252  */
253  int pin;
254 
255  /* what stage of the replay code we're currently in */
256  int stage;
257 
258  /* the root we are currently replaying */
259  struct btrfs_root *replay_dest;
260 
261  /* the trans handle for the current replay */
262  struct btrfs_trans_handle *trans;
263 
264  /* the function that gets used to process blocks we find in the
265  * tree. Note the extent_buffer might not be up to date when it is
266  * passed in, and it must be checked or read if you need the data
267  * inside it
268  */
269  int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
270  struct walk_control *wc, u64 gen);
271 };
272 
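/*
 * Example configuration, mirroring how this file sets up the struct
 * when a log tree is freed at transaction commit (illustration only):
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 */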
273 /*
274  * process_func used to pin down extents, write them or wait on them
275  */
276 static int process_one_buffer(struct btrfs_root *log,
277  struct extent_buffer *eb,
278  struct walk_control *wc, u64 gen)
279 {
280  if (wc->pin)
281  btrfs_pin_extent_for_log_replay(
282  log->fs_info->extent_root,
283  eb->start, eb->len);
284 
285  if (btrfs_buffer_uptodate(eb, gen, 0)) {
286  if (wc->write)
287  btrfs_write_tree_block(eb);
288  if (wc->wait)
289  btrfs_wait_tree_block_writeback(eb);
290  }
291  return 0;
292 }
293 
294 /*
295  * Item overwrite used by replay and tree logging. eb, slot and key all refer
296  * to the src data we are copying out.
297  *
298  * root is the tree we are copying into, and path is a scratch
299  * path for use in this function (it should be released on entry and
300  * will be released on exit).
301  *
302  * If the key is already in the destination tree the existing item is
303  * overwritten. If the existing item isn't big enough, it is extended.
304  * If it is too large, it is truncated.
305  *
306  * If the key isn't in the destination yet, a new item is inserted.
307  */
308 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
309  struct btrfs_root *root,
310  struct btrfs_path *path,
311  struct extent_buffer *eb, int slot,
312  struct btrfs_key *key)
313 {
314  int ret;
315  u32 item_size;
316  u64 saved_i_size = 0;
317  int save_old_i_size = 0;
318  unsigned long src_ptr;
319  unsigned long dst_ptr;
320  int overwrite_root = 0;
321 
322  if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
323  overwrite_root = 1;
324 
325  item_size = btrfs_item_size_nr(eb, slot);
326  src_ptr = btrfs_item_ptr_offset(eb, slot);
327 
328  /* look for the key in the destination tree */
329  ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
330  if (ret == 0) {
331  char *src_copy;
332  char *dst_copy;
333  u32 dst_size = btrfs_item_size_nr(path->nodes[0],
334  path->slots[0]);
335  if (dst_size != item_size)
336  goto insert;
337 
338  if (item_size == 0) {
339  btrfs_release_path(path);
340  return 0;
341  }
342  dst_copy = kmalloc(item_size, GFP_NOFS);
343  src_copy = kmalloc(item_size, GFP_NOFS);
344  if (!dst_copy || !src_copy) {
345  btrfs_release_path(path);
346  kfree(dst_copy);
347  kfree(src_copy);
348  return -ENOMEM;
349  }
350 
351  read_extent_buffer(eb, src_copy, src_ptr, item_size);
352 
353  dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
354  read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
355  item_size);
356  ret = memcmp(dst_copy, src_copy, item_size);
357 
358  kfree(dst_copy);
359  kfree(src_copy);
360  /*
361  * they have the same contents, just return, this saves
362  * us from cowing blocks in the destination tree and doing
363  * extra writes that may not have been done by a previous
364  * sync
365  */
366  if (ret == 0) {
367  btrfs_release_path(path);
368  return 0;
369  }
370 
371  }
372 insert:
373  btrfs_release_path(path);
374  /* try to insert the key into the destination tree */
375  ret = btrfs_insert_empty_item(trans, root, path,
376  key, item_size);
377 
378  /* make sure any existing item is the correct size */
379  if (ret == -EEXIST) {
380  u32 found_size;
381  found_size = btrfs_item_size_nr(path->nodes[0],
382  path->slots[0]);
383  if (found_size > item_size)
384  btrfs_truncate_item(trans, root, path, item_size, 1);
385  else if (found_size < item_size)
386  btrfs_extend_item(trans, root, path,
387  item_size - found_size);
388  } else if (ret) {
389  return ret;
390  }
391  dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
392  path->slots[0]);
393 
394  /* don't overwrite an existing inode if the generation number
395  * was logged as zero. This is done when the tree logging code
396  * is just logging an inode to make sure it exists after recovery.
397  *
398  * Also, don't overwrite i_size on directories during replay.
399  * log replay inserts and removes directory items based on the
400  * state of the tree found in the subvolume, and i_size is modified
401  * as it goes
402  */
403  if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
404  struct btrfs_inode_item *src_item;
405  struct btrfs_inode_item *dst_item;
406 
407  src_item = (struct btrfs_inode_item *)src_ptr;
408  dst_item = (struct btrfs_inode_item *)dst_ptr;
409 
410  if (btrfs_inode_generation(eb, src_item) == 0)
411  goto no_copy;
412 
413  if (overwrite_root &&
414  S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
415  S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
416  save_old_i_size = 1;
417  saved_i_size = btrfs_inode_size(path->nodes[0],
418  dst_item);
419  }
420  }
421 
422  copy_extent_buffer(path->nodes[0], eb, dst_ptr,
423  src_ptr, item_size);
424 
425  if (save_old_i_size) {
426  struct btrfs_inode_item *dst_item;
427  dst_item = (struct btrfs_inode_item *)dst_ptr;
428  btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
429  }
430 
431  /* make sure the generation is filled in */
432  if (key->type == BTRFS_INODE_ITEM_KEY) {
433  struct btrfs_inode_item *dst_item;
434  dst_item = (struct btrfs_inode_item *)dst_ptr;
435  if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
436  btrfs_set_inode_generation(path->nodes[0], dst_item,
437  trans->transid);
438  }
439  }
440 no_copy:
441  btrfs_mark_buffer_dirty(path->nodes[0]);
442  btrfs_release_path(path);
443  return 0;
444 }
445 
446 /*
447  * simple helper to read an inode off the disk from a given root
448  * This can only be called for subvolume roots and not for the log
449  */
450 static noinline struct inode *read_one_inode(struct btrfs_root *root,
451  u64 objectid)
452 {
453  struct btrfs_key key;
454  struct inode *inode;
455 
456  key.objectid = objectid;
457  btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
458  key.offset = 0;
459  inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
460  if (IS_ERR(inode)) {
461  inode = NULL;
462  } else if (is_bad_inode(inode)) {
463  iput(inode);
464  inode = NULL;
465  }
466  return inode;
467 }
468 
469 /* replays a single extent in 'eb' at 'slot' with 'key' into the
470  * subvolume 'root'. path is released on entry and should be released
471  * on exit.
472  *
473  * extents in the log tree have not been allocated out of the extent
474  * tree yet. So, this completes the allocation, taking a reference
475  * as required if the extent already exists or creating a new extent
476  * if it isn't in the extent allocation tree yet.
477  *
478  * The extent is inserted into the file, dropping any existing extents
479  * from the file that overlap the new one.
480  */
481 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
482  struct btrfs_root *root,
483  struct btrfs_path *path,
484  struct extent_buffer *eb, int slot,
485  struct btrfs_key *key)
486 {
487  int found_type;
488  u64 mask = root->sectorsize - 1;
489  u64 extent_end;
490  u64 start = key->offset;
491  u64 saved_nbytes;
492  struct btrfs_file_extent_item *item;
493  struct inode *inode = NULL;
494  unsigned long size;
495  int ret = 0;
496 
497  item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
498  found_type = btrfs_file_extent_type(eb, item);
499 
500  if (found_type == BTRFS_FILE_EXTENT_REG ||
501  found_type == BTRFS_FILE_EXTENT_PREALLOC)
502  extent_end = start + btrfs_file_extent_num_bytes(eb, item);
503  else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
504  size = btrfs_file_extent_inline_len(eb, item);
505  extent_end = (start + size + mask) & ~mask;
506  } else {
507  ret = 0;
508  goto out;
509  }
510 
511  inode = read_one_inode(root, key->objectid);
512  if (!inode) {
513  ret = -EIO;
514  goto out;
515  }
516 
517  /*
518  * first check to see if we already have this extent in the
519  * file. This must be done before the btrfs_drop_extents run
520  * so we don't try to drop this extent.
521  */
522  ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
523  start, 0);
524 
525  if (ret == 0 &&
526  (found_type == BTRFS_FILE_EXTENT_REG ||
527  found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
528  struct btrfs_file_extent_item cmp1;
529  struct btrfs_file_extent_item cmp2;
530  struct btrfs_file_extent_item *existing;
531  struct extent_buffer *leaf;
532 
533  leaf = path->nodes[0];
534  existing = btrfs_item_ptr(leaf, path->slots[0],
535  struct btrfs_file_extent_item);
536 
537  read_extent_buffer(eb, &cmp1, (unsigned long)item,
538  sizeof(cmp1));
539  read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
540  sizeof(cmp2));
541 
542  /*
543  * we already have a pointer to this exact extent,
544  * we don't have to do anything
545  */
546  if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
547  btrfs_release_path(path);
548  goto out;
549  }
550  }
551  btrfs_release_path(path);
552 
553  saved_nbytes = inode_get_bytes(inode);
554  /* drop any overlapping extents */
555  ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
556  BUG_ON(ret);
557 
558  if (found_type == BTRFS_FILE_EXTENT_REG ||
559  found_type == BTRFS_FILE_EXTENT_PREALLOC) {
560  u64 offset;
561  unsigned long dest_offset;
562  struct btrfs_key ins;
563 
564  ret = btrfs_insert_empty_item(trans, root, path, key,
565  sizeof(*item));
566  BUG_ON(ret);
567  dest_offset = btrfs_item_ptr_offset(path->nodes[0],
568  path->slots[0]);
569  copy_extent_buffer(path->nodes[0], eb, dest_offset,
570  (unsigned long)item, sizeof(*item));
571 
572  ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
573  ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
574  ins.type = BTRFS_EXTENT_ITEM_KEY;
575  offset = key->offset - btrfs_file_extent_offset(eb, item);
576 
577  if (ins.objectid > 0) {
578  u64 csum_start;
579  u64 csum_end;
580  LIST_HEAD(ordered_sums);
581  /*
582  * is this extent already allocated in the extent
583  * allocation tree? If so, just add a reference
584  */
585  ret = btrfs_lookup_extent(root, ins.objectid,
586  ins.offset);
587  if (ret == 0) {
588  ret = btrfs_inc_extent_ref(trans, root,
589  ins.objectid, ins.offset,
590  0, root->root_key.objectid,
591  key->objectid, offset, 0);
592  BUG_ON(ret);
593  } else {
594  /*
595  * insert the extent pointer in the extent
596  * allocation tree
597  */
598  ret = btrfs_alloc_logged_file_extent(trans,
599  root, root->root_key.objectid,
600  key->objectid, offset, &ins);
601  BUG_ON(ret);
602  }
603  btrfs_release_path(path);
604 
605  if (btrfs_file_extent_compression(eb, item)) {
606  csum_start = ins.objectid;
607  csum_end = csum_start + ins.offset;
608  } else {
609  csum_start = ins.objectid +
610  btrfs_file_extent_offset(eb, item);
611  csum_end = csum_start +
612  btrfs_file_extent_num_bytes(eb, item);
613  }
614 
615  ret = btrfs_lookup_csums_range(root->log_root,
616  csum_start, csum_end - 1,
617  &ordered_sums, 0);
618  BUG_ON(ret);
619  while (!list_empty(&ordered_sums)) {
620  struct btrfs_ordered_sum *sums;
621  sums = list_entry(ordered_sums.next,
622  struct btrfs_ordered_sum,
623  list);
624  ret = btrfs_csum_file_blocks(trans,
625  root->fs_info->csum_root,
626  sums);
627  BUG_ON(ret);
628  list_del(&sums->list);
629  kfree(sums);
630  }
631  } else {
632  btrfs_release_path(path);
633  }
634  } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
635  /* inline extents are easy, we just overwrite them */
636  ret = overwrite_item(trans, root, path, eb, slot, key);
637  BUG_ON(ret);
638  }
639 
640  inode_set_bytes(inode, saved_nbytes);
641  ret = btrfs_update_inode(trans, root, inode);
642 out:
643  if (inode)
644  iput(inode);
645  return ret;
646 }
647 
648 /*
649  * when cleaning up conflicts between the directory names in the
650  * subvolume, directory names in the log and directory names in the
651  * inode back references, we may have to unlink inodes from directories.
652  *
653  * This is a helper function to do the unlink of a specific directory
654  * item
655  */
656 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
657  struct btrfs_root *root,
658  struct btrfs_path *path,
659  struct inode *dir,
660  struct btrfs_dir_item *di)
661 {
662  struct inode *inode;
663  char *name;
664  int name_len;
665  struct extent_buffer *leaf;
666  struct btrfs_key location;
667  int ret;
668 
669  leaf = path->nodes[0];
670 
671  btrfs_dir_item_key_to_cpu(leaf, di, &location);
672  name_len = btrfs_dir_name_len(leaf, di);
673  name = kmalloc(name_len, GFP_NOFS);
674  if (!name)
675  return -ENOMEM;
676 
677  read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
678  btrfs_release_path(path);
679 
680  inode = read_one_inode(root, location.objectid);
681  if (!inode) {
682  kfree(name);
683  return -EIO;
684  }
685 
686  ret = link_to_fixup_dir(trans, root, path, location.objectid);
687  BUG_ON(ret);
688 
689  ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
690  BUG_ON(ret);
691  kfree(name);
692 
693  iput(inode);
694 
695  btrfs_run_delayed_items(trans, root);
696  return ret;
697 }
698 
699 /*
700  * helper function to see if a given name and sequence number found
701  * in an inode back reference are already in a directory and correctly
702  * point to this inode
703  */
704 static noinline int inode_in_dir(struct btrfs_root *root,
705  struct btrfs_path *path,
706  u64 dirid, u64 objectid, u64 index,
707  const char *name, int name_len)
708 {
709  struct btrfs_dir_item *di;
710  struct btrfs_key location;
711  int match = 0;
712 
713  di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
714  index, name, name_len, 0);
715  if (di && !IS_ERR(di)) {
716  btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
717  if (location.objectid != objectid)
718  goto out;
719  } else
720  goto out;
721  btrfs_release_path(path);
722 
723  di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
724  if (di && !IS_ERR(di)) {
725  btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
726  if (location.objectid != objectid)
727  goto out;
728  } else
729  goto out;
730  match = 1;
731 out:
732  btrfs_release_path(path);
733  return match;
734 }
735 
736 /*
737  * helper function to check a log tree for a named back reference in
738  * an inode. This is used to decide if a back reference that is
739  * found in the subvolume conflicts with what we find in the log.
740  *
741  * inode backreferences may have multiple refs in a single item,
742  * during replay we process one reference at a time, and we don't
743  * want to delete valid links to a file from the subvolume if that
744  * link is also in the log.
745  */
746 static noinline int backref_in_log(struct btrfs_root *log,
747  struct btrfs_key *key,
748  u64 ref_objectid,
749  char *name, int namelen)
750 {
751  struct btrfs_path *path;
752  struct btrfs_inode_ref *ref;
753  unsigned long ptr;
754  unsigned long ptr_end;
755  unsigned long name_ptr;
756  int found_name_len;
757  int item_size;
758  int ret;
759  int match = 0;
760 
761  path = btrfs_alloc_path();
762  if (!path)
763  return -ENOMEM;
764 
765  ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766  if (ret != 0)
767  goto out;
768 
769  ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770 
771  if (key->type == BTRFS_INODE_EXTREF_KEY) {
772  if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773  name, namelen, NULL))
774  match = 1;
775 
776  goto out;
777  }
778 
779  item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
780  ptr_end = ptr + item_size;
781  while (ptr < ptr_end) {
782  ref = (struct btrfs_inode_ref *)ptr;
783  found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
784  if (found_name_len == namelen) {
785  name_ptr = (unsigned long)(ref + 1);
786  ret = memcmp_extent_buffer(path->nodes[0], name,
787  name_ptr, namelen);
788  if (ret == 0) {
789  match = 1;
790  goto out;
791  }
792  }
793  ptr = (unsigned long)(ref + 1) + found_name_len;
794  }
795 out:
796  btrfs_free_path(path);
797  return match;
798 }
799 
800 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
801  struct btrfs_root *root,
802  struct btrfs_path *path,
803  struct btrfs_root *log_root,
804  struct inode *dir, struct inode *inode,
805  struct extent_buffer *eb,
806  u64 inode_objectid, u64 parent_objectid,
807  u64 ref_index, char *name, int namelen,
808  int *search_done)
809 {
810  int ret;
811  char *victim_name;
812  int victim_name_len;
813  struct extent_buffer *leaf;
814  struct btrfs_dir_item *di;
815  struct btrfs_key search_key;
816  struct btrfs_inode_extref *extref;
817 
818 again:
819  /* Search old style refs */
820  search_key.objectid = inode_objectid;
821  search_key.type = BTRFS_INODE_REF_KEY;
822  search_key.offset = parent_objectid;
823  ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
824  if (ret == 0) {
825  struct btrfs_inode_ref *victim_ref;
826  unsigned long ptr;
827  unsigned long ptr_end;
828 
829  leaf = path->nodes[0];
830 
831  /* are we trying to overwrite a back ref for the root directory
832  * if so, just jump out, we're done
833  */
834  if (search_key.objectid == search_key.offset)
835  return 1;
836 
837  /* check all the names in this back reference to see
838  * if they are in the log. if so, we allow them to stay
839  * otherwise they must be unlinked as a conflict
840  */
841  ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
842  ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
843  while (ptr < ptr_end) {
844  victim_ref = (struct btrfs_inode_ref *)ptr;
845  victim_name_len = btrfs_inode_ref_name_len(leaf,
846  victim_ref);
847  victim_name = kmalloc(victim_name_len, GFP_NOFS);
848  BUG_ON(!victim_name);
849 
850  read_extent_buffer(leaf, victim_name,
851  (unsigned long)(victim_ref + 1),
852  victim_name_len);
853 
854  if (!backref_in_log(log_root, &search_key,
855  parent_objectid,
856  victim_name,
857  victim_name_len)) {
858  btrfs_inc_nlink(inode);
859  btrfs_release_path(path);
860 
861  ret = btrfs_unlink_inode(trans, root, dir,
862  inode, victim_name,
863  victim_name_len);
864  BUG_ON(ret);
865  btrfs_run_delayed_items(trans, root);
866  kfree(victim_name);
867  *search_done = 1;
868  goto again;
869  }
870  kfree(victim_name);
871 
872  ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
873  }
874  BUG_ON(ret);
875 
876  /*
877  * NOTE: we have searched the root tree and checked the
878  * corresponding ref; it does not need to be checked again.
879  */
880  *search_done = 1;
881  }
882  btrfs_release_path(path);
883 
884  /* Same search but for extended refs */
885  extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886  inode_objectid, parent_objectid, 0,
887  0);
888  if (!IS_ERR_OR_NULL(extref)) {
889  u32 item_size;
890  u32 cur_offset = 0;
891  unsigned long base;
892  struct inode *victim_parent;
893 
894  leaf = path->nodes[0];
895 
896  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897  base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898 
899  while (cur_offset < item_size) {
900  extref = (struct btrfs_inode_extref *)(base + cur_offset);
901 
902  victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903 
904  if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905  goto next;
906 
907  victim_name = kmalloc(victim_name_len, GFP_NOFS);
908  read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909  victim_name_len);
910 
911  search_key.objectid = inode_objectid;
912  search_key.type = BTRFS_INODE_EXTREF_KEY;
913  search_key.offset = btrfs_extref_hash(parent_objectid,
914  victim_name,
915  victim_name_len);
916  ret = 0;
917  if (!backref_in_log(log_root, &search_key,
918  parent_objectid, victim_name,
919  victim_name_len)) {
920  ret = -ENOENT;
921  victim_parent = read_one_inode(root,
922  parent_objectid);
923  if (victim_parent) {
924  btrfs_inc_nlink(inode);
925  btrfs_release_path(path);
926 
927  ret = btrfs_unlink_inode(trans, root,
928  victim_parent,
929  inode,
930  victim_name,
931  victim_name_len);
932  btrfs_run_delayed_items(trans, root);
933  }
934  BUG_ON(ret);
935  iput(victim_parent);
936  kfree(victim_name);
937  *search_done = 1;
938  goto again;
939  }
940  kfree(victim_name);
941  BUG_ON(ret);
942 next:
943  cur_offset += victim_name_len + sizeof(*extref);
944  }
945  *search_done = 1;
946  }
947  btrfs_release_path(path);
948 
949  /* look for a conflicting sequence number */
950  di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
951  ref_index, name, namelen, 0);
952  if (di && !IS_ERR(di)) {
953  ret = drop_one_dir_item(trans, root, path, dir, di);
954  BUG_ON(ret);
955  }
956  btrfs_release_path(path);
957 
958  /* look for a conflicting name */
959  di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
960  name, namelen, 0);
961  if (di && !IS_ERR(di)) {
962  ret = drop_one_dir_item(trans, root, path, dir, di);
963  BUG_ON(ret);
964  }
965  btrfs_release_path(path);
966 
967  return 0;
968 }
969 
970 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971  u32 *namelen, char **name, u64 *index,
972  u64 *parent_objectid)
973 {
974  struct btrfs_inode_extref *extref;
975 
976  extref = (struct btrfs_inode_extref *)ref_ptr;
977 
978  *namelen = btrfs_inode_extref_name_len(eb, extref);
979  *name = kmalloc(*namelen, GFP_NOFS);
980  if (*name == NULL)
981  return -ENOMEM;
982 
983  read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984  *namelen);
985 
986  *index = btrfs_inode_extref_index(eb, extref);
987  if (parent_objectid)
988  *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989 
990  return 0;
991 }
992 
993 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994  u32 *namelen, char **name, u64 *index)
995 {
996  struct btrfs_inode_ref *ref;
997 
998  ref = (struct btrfs_inode_ref *)ref_ptr;
999 
1000  *namelen = btrfs_inode_ref_name_len(eb, ref);
1001  *name = kmalloc(*namelen, GFP_NOFS);
1002  if (*name == NULL)
1003  return -ENOMEM;
1004 
1005  read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006 
1007  *index = btrfs_inode_ref_index(eb, ref);
1008 
1009  return 0;
1010 }
1011 
1012 /*
1013  * replay one inode back reference item found in the log tree.
1014  * eb, slot and key refer to the buffer and key found in the log tree.
1015  * root is the destination we are replaying into, and path is for temp
1016  * use by this function. (it should be released on return).
1017  */
1018 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019  struct btrfs_root *root,
1020  struct btrfs_root *log,
1021  struct btrfs_path *path,
1022  struct extent_buffer *eb, int slot,
1023  struct btrfs_key *key)
1024 {
1025  struct inode *dir;
1026  struct inode *inode;
1027  unsigned long ref_ptr;
1028  unsigned long ref_end;
1029  char *name;
1030  int namelen;
1031  int ret;
1032  int search_done = 0;
1033  int log_ref_ver = 0;
1034  u64 parent_objectid;
1035  u64 inode_objectid;
1036  u64 ref_index = 0;
1037  int ref_struct_size;
1038 
1039  ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040  ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041 
1042  if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043  struct btrfs_inode_extref *r;
1044 
1045  ref_struct_size = sizeof(struct btrfs_inode_extref);
1046  log_ref_ver = 1;
1047  r = (struct btrfs_inode_extref *)ref_ptr;
1048  parent_objectid = btrfs_inode_extref_parent(eb, r);
1049  } else {
1050  ref_struct_size = sizeof(struct btrfs_inode_ref);
1051  parent_objectid = key->offset;
1052  }
1053  inode_objectid = key->objectid;
1054 
1055  /*
1056  * it is possible that we didn't log all the parent directories
1057  * for a given inode. If we don't find the dir, just don't
1058  * copy the back ref in. The link count fixup code will take
1059  * care of the rest
1060  */
1061  dir = read_one_inode(root, parent_objectid);
1062  if (!dir)
1063  return -ENOENT;
1064 
1065  inode = read_one_inode(root, inode_objectid);
1066  if (!inode) {
1067  iput(dir);
1068  return -EIO;
1069  }
1070 
1071  while (ref_ptr < ref_end) {
1072  if (log_ref_ver) {
1073  ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074  &ref_index, &parent_objectid);
1075  /*
1076  * parent object can change from one array
1077  * item to another.
1078  */
1079  if (!dir)
1080  dir = read_one_inode(root, parent_objectid);
1081  if (!dir)
1082  return -ENOENT;
1083  } else {
1084  ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085  &ref_index);
1086  }
1087  if (ret)
1088  return ret;
1089 
1090  /* if we already have a perfect match, we're done */
1091  if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092  ref_index, name, namelen)) {
1093  /*
1094  * look for a conflicting back reference in the
1095  * metadata. if we find one we have to unlink that name
1096  * of the file before we add our new link. Later on, we
1097  * overwrite any existing back reference, and we don't
1098  * want to create dangling pointers in the directory.
1099  */
1100 
1101  if (!search_done) {
1102  ret = __add_inode_ref(trans, root, path, log,
1103  dir, inode, eb,
1104  inode_objectid,
1105  parent_objectid,
1106  ref_index, name, namelen,
1107  &search_done);
1108  if (ret == 1)
1109  goto out;
1110  BUG_ON(ret);
1111  }
1112 
1113  /* insert our name */
1114  ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115  0, ref_index);
1116  BUG_ON(ret);
1117 
1118  btrfs_update_inode(trans, root, inode);
1119  }
1120 
1121  ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122  kfree(name);
1123  if (log_ref_ver) {
1124  iput(dir);
1125  dir = NULL;
1126  }
1127  }
1128 
1129  /* finally write the back reference in the inode */
1130  ret = overwrite_item(trans, root, path, eb, slot, key);
1131  BUG_ON(ret);
1132 
1133 out:
1134  btrfs_release_path(path);
1135  iput(dir);
1136  iput(inode);
1137  return 0;
1138 }
1139 
1140 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1141  struct btrfs_root *root, u64 offset)
1142 {
1143  int ret;
1144  ret = btrfs_find_orphan_item(root, offset);
1145  if (ret > 0)
1146  ret = btrfs_insert_orphan_item(trans, root, offset);
1147  return ret;
1148 }
1149 
1150 static int count_inode_extrefs(struct btrfs_root *root,
1151  struct inode *inode, struct btrfs_path *path)
1152 {
1153  int ret = 0;
1154  int name_len;
1155  unsigned int nlink = 0;
1156  u32 item_size;
1157  u32 cur_offset = 0;
1158  u64 inode_objectid = btrfs_ino(inode);
1159  u64 offset = 0;
1160  unsigned long ptr;
1161  struct btrfs_inode_extref *extref;
1162  struct extent_buffer *leaf;
1163 
1164  while (1) {
1165  ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1166  &extref, &offset);
1167  if (ret)
1168  break;
1169 
1170  leaf = path->nodes[0];
1171  item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1172  ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1173 
1174  while (cur_offset < item_size) {
1175  extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1176  name_len = btrfs_inode_extref_name_len(leaf, extref);
1177 
1178  nlink++;
1179 
1180  cur_offset += name_len + sizeof(*extref);
1181  }
1182 
1183  offset++;
1184  btrfs_release_path(path);
1185  }
1186  btrfs_release_path(path);
1187 
1188  if (ret < 0)
1189  return ret;
1190  return nlink;
1191 }
1192 
1193 static int count_inode_refs(struct btrfs_root *root,
1194  struct inode *inode, struct btrfs_path *path)
1195 {
1196  int ret;
1197  struct btrfs_key key;
1198  unsigned int nlink = 0;
1199  unsigned long ptr;
1200  unsigned long ptr_end;
1201  int name_len;
1202  u64 ino = btrfs_ino(inode);
1203 
1204  key.objectid = ino;
1205  key.type = BTRFS_INODE_REF_KEY;
1206  key.offset = (u64)-1;
1207 
1208  while (1) {
1209  ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1210  if (ret < 0)
1211  break;
1212  if (ret > 0) {
1213  if (path->slots[0] == 0)
1214  break;
1215  path->slots[0]--;
1216  }
1217  btrfs_item_key_to_cpu(path->nodes[0], &key,
1218  path->slots[0]);
1219  if (key.objectid != ino ||
1220  key.type != BTRFS_INODE_REF_KEY)
1221  break;
1222  ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1223  ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1224  path->slots[0]);
1225  while (ptr < ptr_end) {
1226  struct btrfs_inode_ref *ref;
1227 
1228  ref = (struct btrfs_inode_ref *)ptr;
1229  name_len = btrfs_inode_ref_name_len(path->nodes[0],
1230  ref);
1231  ptr = (unsigned long)(ref + 1) + name_len;
1232  nlink++;
1233  }
1234 
1235  if (key.offset == 0)
1236  break;
1237  key.offset--;
1238  btrfs_release_path(path);
1239  }
1240  btrfs_release_path(path);
1241 
1242  return nlink;
1243 }
1244 
1245 /*
1246  * There are a few corners where the link count of the file can't
1247  * be properly maintained during replay. So, instead of adding
1248  * lots of complexity to the log code, we just scan the backrefs
1249  * for any file that has been through replay.
1250  *
1251  * The scan will update the link count on the inode to reflect the
1252  * number of back refs found. If it goes down to zero, the iput
1253  * will free the inode.
1254  */
1255 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256  struct btrfs_root *root,
1257  struct inode *inode)
1258 {
1259  struct btrfs_path *path;
1260  int ret;
1261  u64 nlink = 0;
1262  u64 ino = btrfs_ino(inode);
1263 
1264  path = btrfs_alloc_path();
1265  if (!path)
1266  return -ENOMEM;
1267 
1268  ret = count_inode_refs(root, inode, path);
1269  if (ret < 0)
1270  goto out;
1271 
1272  nlink = ret;
1273 
1274  ret = count_inode_extrefs(root, inode, path);
1275  if (ret == -ENOENT)
1276  ret = 0;
1277 
1278  if (ret < 0)
1279  goto out;
1280 
1281  nlink += ret;
1282 
1283  ret = 0;
1284 
1285  if (nlink != inode->i_nlink) {
1286  set_nlink(inode, nlink);
1287  btrfs_update_inode(trans, root, inode);
1288  }
1289  BTRFS_I(inode)->index_cnt = (u64)-1;
1290 
1291  if (inode->i_nlink == 0) {
1292  if (S_ISDIR(inode->i_mode)) {
1293  ret = replay_dir_deletes(trans, root, NULL, path,
1294  ino, 1);
1295  BUG_ON(ret);
1296  }
1297  ret = insert_orphan_item(trans, root, ino);
1298  BUG_ON(ret);
1299  }
1300 
1301 out:
1302  btrfs_free_path(path);
1303  return ret;
1304 }
1305 
1306 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1307  struct btrfs_root *root,
1308  struct btrfs_path *path)
1309 {
1310  int ret;
1311  struct btrfs_key key;
1312  struct inode *inode;
1313 
1314  key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1315  key.type = BTRFS_ORPHAN_ITEM_KEY;
1316  key.offset = (u64)-1;
1317  while (1) {
1318  ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1319  if (ret < 0)
1320  break;
1321 
1322  if (ret == 1) {
1323  if (path->slots[0] == 0)
1324  break;
1325  path->slots[0]--;
1326  }
1327 
1328  btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1329  if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1330  key.type != BTRFS_ORPHAN_ITEM_KEY)
1331  break;
1332 
1333  ret = btrfs_del_item(trans, root, path);
1334  if (ret)
1335  goto out;
1336 
1337  btrfs_release_path(path);
1338  inode = read_one_inode(root, key.offset);
1339  if (!inode)
1340  return -EIO;
1341 
1342  ret = fixup_inode_link_count(trans, root, inode);
1343  BUG_ON(ret);
1344 
1345  iput(inode);
1346 
1347  /*
1348  * fixup on a directory may create new entries,
1349  * make sure we always look for the highest possible
1350  * offset
1351  */
1352  key.offset = (u64)-1;
1353  }
1354  ret = 0;
1355 out:
1356  btrfs_release_path(path);
1357  return ret;
1358 }
1359 
1360 
1361 /*
1362  * record a given inode in the fixup dir so we can check its link
1363  * count when replay is done. The link count is incremented here
1364  * so the inode won't go away until we check it
1365  */
1366 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1367  struct btrfs_root *root,
1368  struct btrfs_path *path,
1369  u64 objectid)
1370 {
1371  struct btrfs_key key;
1372  int ret = 0;
1373  struct inode *inode;
1374 
1375  inode = read_one_inode(root, objectid);
1376  if (!inode)
1377  return -EIO;
1378 
1379  key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1380  btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1381  key.offset = objectid;
1382 
1383  ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1384 
1385  btrfs_release_path(path);
1386  if (ret == 0) {
1387  btrfs_inc_nlink(inode);
1388  ret = btrfs_update_inode(trans, root, inode);
1389  } else if (ret == -EEXIST) {
1390  ret = 0;
1391  } else {
1392  BUG();
1393  }
1394  iput(inode);
1395 
1396  return ret;
1397 }
1398 
1399 /*
1400  * when replaying the log for a directory, we only insert names
1401  * for inodes that actually exist. This means an fsync on a directory
1402  * does not implicitly fsync all the new files in it
1403  */
1404 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1405  struct btrfs_root *root,
1406  struct btrfs_path *path,
1407  u64 dirid, u64 index,
1408  char *name, int name_len, u8 type,
1409  struct btrfs_key *location)
1410 {
1411  struct inode *inode;
1412  struct inode *dir;
1413  int ret;
1414 
1415  inode = read_one_inode(root, location->objectid);
1416  if (!inode)
1417  return -ENOENT;
1418 
1419  dir = read_one_inode(root, dirid);
1420  if (!dir) {
1421  iput(inode);
1422  return -EIO;
1423  }
1424  ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1425 
1426  /* FIXME, put inode into FIXUP list */
1427 
1428  iput(inode);
1429  iput(dir);
1430  return ret;
1431 }
1432 
1433 /*
1434  * take a single entry in a log directory item and replay it into
1435  * the subvolume.
1436  *
1437  * if a conflicting item exists in the subdirectory already,
1438  * the inode it points to is unlinked and put into the link count
1439  * fix up tree.
1440  *
1441  * If a name from the log points to a file or directory that does
1442  * not exist in the FS, it is skipped. fsyncs on directories
1443  * do not force down inodes inside that directory, just changes to the
1444  * names or unlinks in a directory.
1445  */
1446 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1447  struct btrfs_root *root,
1448  struct btrfs_path *path,
1449  struct extent_buffer *eb,
1450  struct btrfs_dir_item *di,
1451  struct btrfs_key *key)
1452 {
1453  char *name;
1454  int name_len;
1455  struct btrfs_dir_item *dst_di;
1456  struct btrfs_key found_key;
1457  struct btrfs_key log_key;
1458  struct inode *dir;
1459  u8 log_type;
1460  int exists;
1461  int ret;
1462 
1463  dir = read_one_inode(root, key->objectid);
1464  if (!dir)
1465  return -EIO;
1466 
1467  name_len = btrfs_dir_name_len(eb, di);
1468  name = kmalloc(name_len, GFP_NOFS);
1469  if (!name)
1470  return -ENOMEM;
1471 
1472  log_type = btrfs_dir_type(eb, di);
1473  read_extent_buffer(eb, name, (unsigned long)(di + 1),
1474  name_len);
1475 
1476  btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1477  exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1478  if (exists == 0)
1479  exists = 1;
1480  else
1481  exists = 0;
1482  btrfs_release_path(path);
1483 
1484  if (key->type == BTRFS_DIR_ITEM_KEY) {
1485  dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1486  name, name_len, 1);
1487  } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1488  dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1489  key->objectid,
1490  key->offset, name,
1491  name_len, 1);
1492  } else {
1493  BUG();
1494  }
1495  if (IS_ERR_OR_NULL(dst_di)) {
1496  /* we need a sequence number to insert, so we only
1497  * do inserts for the BTRFS_DIR_INDEX_KEY types
1498  */
1499  if (key->type != BTRFS_DIR_INDEX_KEY)
1500  goto out;
1501  goto insert;
1502  }
1503 
1504  btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1505  /* the existing item matches the logged item */
1506  if (found_key.objectid == log_key.objectid &&
1507  found_key.type == log_key.type &&
1508  found_key.offset == log_key.offset &&
1509  btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1510  goto out;
1511  }
1512 
1513  /*
1514  * don't drop the conflicting directory entry if the inode
1515  * for the new entry doesn't exist
1516  */
1517  if (!exists)
1518  goto out;
1519 
1520  ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1521  BUG_ON(ret);
1522 
1523  if (key->type == BTRFS_DIR_INDEX_KEY)
1524  goto insert;
1525 out:
1526  btrfs_release_path(path);
1527  kfree(name);
1528  iput(dir);
1529  return 0;
1530 
1531 insert:
1532  btrfs_release_path(path);
1533  ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1534  name, name_len, log_type, &log_key);
1535 
1536  BUG_ON(ret && ret != -ENOENT);
1537  goto out;
1538 }
1539 
1540 /*
1541  * find all the names in a directory item and reconcile them into
1542  * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1543  * one name in a directory item, but the same code gets used for
1544  * both directory index types
1545  */
1546 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1547  struct btrfs_root *root,
1548  struct btrfs_path *path,
1549  struct extent_buffer *eb, int slot,
1550  struct btrfs_key *key)
1551 {
1552  int ret;
1553  u32 item_size = btrfs_item_size_nr(eb, slot);
1554  struct btrfs_dir_item *di;
1555  int name_len;
1556  unsigned long ptr;
1557  unsigned long ptr_end;
1558 
1559  ptr = btrfs_item_ptr_offset(eb, slot);
1560  ptr_end = ptr + item_size;
1561  while (ptr < ptr_end) {
1562  di = (struct btrfs_dir_item *)ptr;
1563  if (verify_dir_item(root, eb, di))
1564  return -EIO;
1565  name_len = btrfs_dir_name_len(eb, di);
1566  ret = replay_one_name(trans, root, path, eb, di, key);
1567  BUG_ON(ret);
1568  ptr = (unsigned long)(di + 1);
1569  ptr += name_len;
1570  }
1571  return 0;
1572 }
1573 
1574 /*
1575  * directory replay has two parts. There are the standard directory
1576  * items in the log copied from the subvolume, and range items
1577  * created in the log while the subvolume was logged.
1578  *
1579  * The range items tell us which parts of the key space the log
1580  * is authoritative for. During replay, if a key in the subvolume
1581  * directory is in a logged range item, but not actually in the log
1582  * that means it was deleted from the directory before the fsync
1583  * and should be removed.
1584  */
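/*
 * Sketch of how the range items are consumed, condensed from
 * replay_dir_deletes() below (not verbatim): every logged range is
 * fetched in turn, and any subvolume directory key inside
 * [range_start, range_end] that is missing from the log gets unlinked:
 *
 *	range_start = 0;
 *	while (find_dir_range(log, path, dirid, key_type,
 *			      &range_start, &range_end) == 0) {
 *		... check each key in the range against the log ...
 *		range_start = range_end + 1;
 *	}
 */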
1585 static noinline int find_dir_range(struct btrfs_root *root,
1586  struct btrfs_path *path,
1587  u64 dirid, int key_type,
1588  u64 *start_ret, u64 *end_ret)
1589 {
1590  struct btrfs_key key;
1591  u64 found_end;
1592  struct btrfs_dir_log_item *item;
1593  int ret;
1594  int nritems;
1595 
1596  if (*start_ret == (u64)-1)
1597  return 1;
1598 
1599  key.objectid = dirid;
1600  key.type = key_type;
1601  key.offset = *start_ret;
1602 
1603  ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1604  if (ret < 0)
1605  goto out;
1606  if (ret > 0) {
1607  if (path->slots[0] == 0)
1608  goto out;
1609  path->slots[0]--;
1610  }
1611  if (ret != 0)
1612  btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1613 
1614  if (key.type != key_type || key.objectid != dirid) {
1615  ret = 1;
1616  goto next;
1617  }
1618  item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1619  struct btrfs_dir_log_item);
1620  found_end = btrfs_dir_log_end(path->nodes[0], item);
1621 
1622  if (*start_ret >= key.offset && *start_ret <= found_end) {
1623  ret = 0;
1624  *start_ret = key.offset;
1625  *end_ret = found_end;
1626  goto out;
1627  }
1628  ret = 1;
1629 next:
1630  /* check the next slot in the tree to see if it is a valid item */
1631  nritems = btrfs_header_nritems(path->nodes[0]);
1632  if (path->slots[0] >= nritems) {
1633  ret = btrfs_next_leaf(root, path);
1634  if (ret)
1635  goto out;
1636  } else {
1637  path->slots[0]++;
1638  }
1639 
1640  btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1641 
1642  if (key.type != key_type || key.objectid != dirid) {
1643  ret = 1;
1644  goto out;
1645  }
1646  item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1647  struct btrfs_dir_log_item);
1648  found_end = btrfs_dir_log_end(path->nodes[0], item);
1649  *start_ret = key.offset;
1650  *end_ret = found_end;
1651  ret = 0;
1652 out:
1653  btrfs_release_path(path);
1654  return ret;
1655 }
1656 
1657 /*
1658  * this looks for a given directory item in the log. If the directory
1659  * item is not in the log, the item is removed and the inode it points
1660  * to is unlinked
1661  */
1662 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1663  struct btrfs_root *root,
1664  struct btrfs_root *log,
1665  struct btrfs_path *path,
1666  struct btrfs_path *log_path,
1667  struct inode *dir,
1668  struct btrfs_key *dir_key)
1669 {
1670  int ret;
1671  struct extent_buffer *eb;
1672  int slot;
1673  u32 item_size;
1674  struct btrfs_dir_item *di;
1675  struct btrfs_dir_item *log_di;
1676  int name_len;
1677  unsigned long ptr;
1678  unsigned long ptr_end;
1679  char *name;
1680  struct inode *inode;
1681  struct btrfs_key location;
1682 
1683 again:
1684  eb = path->nodes[0];
1685  slot = path->slots[0];
1686  item_size = btrfs_item_size_nr(eb, slot);
1687  ptr = btrfs_item_ptr_offset(eb, slot);
1688  ptr_end = ptr + item_size;
1689  while (ptr < ptr_end) {
1690  di = (struct btrfs_dir_item *)ptr;
1691  if (verify_dir_item(root, eb, di)) {
1692  ret = -EIO;
1693  goto out;
1694  }
1695 
1696  name_len = btrfs_dir_name_len(eb, di);
1697  name = kmalloc(name_len, GFP_NOFS);
1698  if (!name) {
1699  ret = -ENOMEM;
1700  goto out;
1701  }
1702  read_extent_buffer(eb, name, (unsigned long)(di + 1),
1703  name_len);
1704  log_di = NULL;
1705  if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1706  log_di = btrfs_lookup_dir_item(trans, log, log_path,
1707  dir_key->objectid,
1708  name, name_len, 0);
1709  } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1710  log_di = btrfs_lookup_dir_index_item(trans, log,
1711  log_path,
1712  dir_key->objectid,
1713  dir_key->offset,
1714  name, name_len, 0);
1715  }
1716  if (IS_ERR_OR_NULL(log_di)) {
1717  btrfs_dir_item_key_to_cpu(eb, di, &location);
1718  btrfs_release_path(path);
1719  btrfs_release_path(log_path);
1720  inode = read_one_inode(root, location.objectid);
1721  if (!inode) {
1722  kfree(name);
1723  return -EIO;
1724  }
1725 
1726  ret = link_to_fixup_dir(trans, root,
1727  path, location.objectid);
1728  BUG_ON(ret);
1729  btrfs_inc_nlink(inode);
1730  ret = btrfs_unlink_inode(trans, root, dir, inode,
1731  name, name_len);
1732  BUG_ON(ret);
1733 
1734  btrfs_run_delayed_items(trans, root);
1735 
1736  kfree(name);
1737  iput(inode);
1738 
1739  /* there might still be more names under this key
1740  * check and repeat if required
1741  */
1742  ret = btrfs_search_slot(NULL, root, dir_key, path,
1743  0, 0);
1744  if (ret == 0)
1745  goto again;
1746  ret = 0;
1747  goto out;
1748  }
1749  btrfs_release_path(log_path);
1750  kfree(name);
1751 
1752  ptr = (unsigned long)(di + 1);
1753  ptr += name_len;
1754  }
1755  ret = 0;
1756 out:
1757  btrfs_release_path(path);
1758  btrfs_release_path(log_path);
1759  return ret;
1760 }
1761 
1762 /*
1763  * deletion replay happens before we copy any new directory items
1764  * out of the log or out of backreferences from inodes. It
1765  * scans the log to find ranges of keys that log is authoritative for,
1766  * and then scans the directory to find items in those ranges that are
1767  * not present in the log.
1768  *
1769  * Anything we don't find in the log is unlinked and removed from the
1770  * directory.
1771  */
1772 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1773  struct btrfs_root *root,
1774  struct btrfs_root *log,
1775  struct btrfs_path *path,
1776  u64 dirid, int del_all)
1777 {
1778  u64 range_start;
1779  u64 range_end;
1780  int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1781  int ret = 0;
1782  struct btrfs_key dir_key;
1783  struct btrfs_key found_key;
1784  struct btrfs_path *log_path;
1785  struct inode *dir;
1786 
1787  dir_key.objectid = dirid;
1788  dir_key.type = BTRFS_DIR_ITEM_KEY;
1789  log_path = btrfs_alloc_path();
1790  if (!log_path)
1791  return -ENOMEM;
1792 
1793  dir = read_one_inode(root, dirid);
1794  /* it isn't an error if the inode isn't there, that can happen
1795  * because we replay the deletes before we copy in the inode item
1796  * from the log
1797  */
1798  if (!dir) {
1799  btrfs_free_path(log_path);
1800  return 0;
1801  }
1802 again:
1803  range_start = 0;
1804  range_end = 0;
1805  while (1) {
1806  if (del_all)
1807  range_end = (u64)-1;
1808  else {
1809  ret = find_dir_range(log, path, dirid, key_type,
1810  &range_start, &range_end);
1811  if (ret != 0)
1812  break;
1813  }
1814 
1815  dir_key.offset = range_start;
1816  while (1) {
1817  int nritems;
1818  ret = btrfs_search_slot(NULL, root, &dir_key, path,
1819  0, 0);
1820  if (ret < 0)
1821  goto out;
1822 
1823  nritems = btrfs_header_nritems(path->nodes[0]);
1824  if (path->slots[0] >= nritems) {
1825  ret = btrfs_next_leaf(root, path);
1826  if (ret)
1827  break;
1828  }
1829  btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1830  path->slots[0]);
1831  if (found_key.objectid != dirid ||
1832  found_key.type != dir_key.type)
1833  goto next_type;
1834 
1835  if (found_key.offset > range_end)
1836  break;
1837 
1838  ret = check_item_in_log(trans, root, log, path,
1839  log_path, dir,
1840  &found_key);
1841  BUG_ON(ret);
1842  if (found_key.offset == (u64)-1)
1843  break;
1844  dir_key.offset = found_key.offset + 1;
1845  }
1846  btrfs_release_path(path);
1847  if (range_end == (u64)-1)
1848  break;
1849  range_start = range_end + 1;
1850  }
1851 
1852 next_type:
1853  ret = 0;
1854  if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1855  key_type = BTRFS_DIR_LOG_INDEX_KEY;
1856  dir_key.type = BTRFS_DIR_INDEX_KEY;
1857  btrfs_release_path(path);
1858  goto again;
1859  }
1860 out:
1861  btrfs_release_path(path);
1862  btrfs_free_path(log_path);
1863  iput(dir);
1864  return ret;
1865 }
1866 
1867 /*
1868  * the process_func used to replay items from the log tree. This
1869  * gets called in two different stages. The first stage just looks
1870  * for inodes and makes sure they are all copied into the subvolume.
1871  *
1872  * The second stage copies all the other item types from the log into
1873  * the subvolume. The two stage approach is slower, but gets rid of
1874  * lots of complexity around inodes referencing other inodes that exist
1875  * only in the log (references come from either directory items or inode
1876  * back refs).
1877  */
1878 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1879  struct walk_control *wc, u64 gen)
1880 {
1881  int nritems;
1882  struct btrfs_path *path;
1883  struct btrfs_root *root = wc->replay_dest;
1884  struct btrfs_key key;
1885  int level;
1886  int i;
1887  int ret;
1888 
1889  ret = btrfs_read_buffer(eb, gen);
1890  if (ret)
1891  return ret;
1892 
1893  level = btrfs_header_level(eb);
1894 
1895  if (level != 0)
1896  return 0;
1897 
1898  path = btrfs_alloc_path();
1899  if (!path)
1900  return -ENOMEM;
1901 
1902  nritems = btrfs_header_nritems(eb);
1903  for (i = 0; i < nritems; i++) {
1904  btrfs_item_key_to_cpu(eb, &key, i);
1905 
1906  /* inode keys are done during the first stage */
1907  if (key.type == BTRFS_INODE_ITEM_KEY &&
1908  wc->stage == LOG_WALK_REPLAY_INODES) {
1909  struct btrfs_inode_item *inode_item;
1910  u32 mode;
1911 
1912  inode_item = btrfs_item_ptr(eb, i,
1913  struct btrfs_inode_item);
1914  mode = btrfs_inode_mode(eb, inode_item);
1915  if (S_ISDIR(mode)) {
1916  ret = replay_dir_deletes(wc->trans,
1917  root, log, path, key.objectid, 0);
1918  BUG_ON(ret);
1919  }
1920  ret = overwrite_item(wc->trans, root, path,
1921  eb, i, &key);
1922  BUG_ON(ret);
1923 
1924  /* for regular files, make sure the corresponding
1925  * orphan item exists. Extents past the new EOF
1926  * will be truncated later by orphan cleanup.
1927  */
1928  if (S_ISREG(mode)) {
1929  ret = insert_orphan_item(wc->trans, root,
1930  key.objectid);
1931  BUG_ON(ret);
1932  }
1933 
1934  ret = link_to_fixup_dir(wc->trans, root,
1935  path, key.objectid);
1936  BUG_ON(ret);
1937  }
1938  if (wc->stage < LOG_WALK_REPLAY_ALL)
1939  continue;
1940 
1941  /* these keys are simply copied */
1942  if (key.type == BTRFS_XATTR_ITEM_KEY) {
1943  ret = overwrite_item(wc->trans, root, path,
1944  eb, i, &key);
1945  BUG_ON(ret);
1946  } else if (key.type == BTRFS_INODE_REF_KEY) {
1947  ret = add_inode_ref(wc->trans, root, log, path,
1948  eb, i, &key);
1949  BUG_ON(ret && ret != -ENOENT);
1950  } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951  ret = add_inode_ref(wc->trans, root, log, path,
1952  eb, i, &key);
1953  BUG_ON(ret && ret != -ENOENT);
1954  } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1955  ret = replay_one_extent(wc->trans, root, path,
1956  eb, i, &key);
1957  BUG_ON(ret);
1958  } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1959  key.type == BTRFS_DIR_INDEX_KEY) {
1960  ret = replay_one_dir_item(wc->trans, root, path,
1961  eb, i, &key);
1962  BUG_ON(ret);
1963  }
1964  }
1965  btrfs_free_path(path);
1966  return 0;
1967 }
1968 
1969 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1970  struct btrfs_root *root,
1971  struct btrfs_path *path, int *level,
1972  struct walk_control *wc)
1973 {
1974  u64 root_owner;
1975  u64 bytenr;
1976  u64 ptr_gen;
1977  struct extent_buffer *next;
1978  struct extent_buffer *cur;
1979  struct extent_buffer *parent;
1980  u32 blocksize;
1981  int ret = 0;
1982 
1983  WARN_ON(*level < 0);
1984  WARN_ON(*level >= BTRFS_MAX_LEVEL);
1985 
1986  while (*level > 0) {
1987  WARN_ON(*level < 0);
1988  WARN_ON(*level >= BTRFS_MAX_LEVEL);
1989  cur = path->nodes[*level];
1990 
1991  if (btrfs_header_level(cur) != *level)
1992  WARN_ON(1);
1993 
1994  if (path->slots[*level] >=
1995  btrfs_header_nritems(cur))
1996  break;
1997 
1998  bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1999  ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2000  blocksize = btrfs_level_size(root, *level - 1);
2001 
2002  parent = path->nodes[*level];
2003  root_owner = btrfs_header_owner(parent);
2004 
2005  next = btrfs_find_create_tree_block(root, bytenr, blocksize);
2006  if (!next)
2007  return -ENOMEM;
2008 
2009  if (*level == 1) {
2010  ret = wc->process_func(root, next, wc, ptr_gen);
2011  if (ret)
2012  return ret;
2013 
2014  path->slots[*level]++;
2015  if (wc->free) {
2016  ret = btrfs_read_buffer(next, ptr_gen);
2017  if (ret) {
2018  free_extent_buffer(next);
2019  return ret;
2020  }
2021 
2022  btrfs_tree_lock(next);
2023  btrfs_set_lock_blocking(next);
2024  clean_tree_block(trans, root, next);
2025  btrfs_wait_tree_block_writeback(next);
2026  btrfs_tree_unlock(next);
2027 
2028  WARN_ON(root_owner !=
2029  BTRFS_TREE_LOG_OBJECTID);
2030  ret = btrfs_free_and_pin_reserved_extent(root,
2031  bytenr, blocksize);
2032  BUG_ON(ret); /* -ENOMEM or logic errors */
2033  }
2034  free_extent_buffer(next);
2035  continue;
2036  }
2037  ret = btrfs_read_buffer(next, ptr_gen);
2038  if (ret) {
2039  free_extent_buffer(next);
2040  return ret;
2041  }
2042 
2043  WARN_ON(*level <= 0);
2044  if (path->nodes[*level-1])
2045  free_extent_buffer(path->nodes[*level-1]);
2046  path->nodes[*level-1] = next;
2047  *level = btrfs_header_level(next);
2048  path->slots[*level] = 0;
2049  cond_resched();
2050  }
2051  WARN_ON(*level < 0);
2052  WARN_ON(*level >= BTRFS_MAX_LEVEL);
2053 
2054  path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2055 
2056  cond_resched();
2057  return 0;
2058 }
2059 
2060 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2061  struct btrfs_root *root,
2062  struct btrfs_path *path, int *level,
2063  struct walk_control *wc)
2064 {
2065  u64 root_owner;
2066  int i;
2067  int slot;
2068  int ret;
2069 
2070  for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2071  slot = path->slots[i];
2072  if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2073  path->slots[i]++;
2074  *level = i;
2075  WARN_ON(*level == 0);
2076  return 0;
2077  } else {
2078  struct extent_buffer *parent;
2079  if (path->nodes[*level] == root->node)
2080  parent = path->nodes[*level];
2081  else
2082  parent = path->nodes[*level + 1];
2083 
2084  root_owner = btrfs_header_owner(parent);
2085  ret = wc->process_func(root, path->nodes[*level], wc,
2086  btrfs_header_generation(path->nodes[*level]));
2087  if (ret)
2088  return ret;
2089 
2090  if (wc->free) {
2091  struct extent_buffer *next;
2092 
2093  next = path->nodes[*level];
2094 
2095  btrfs_tree_lock(next);
2096  btrfs_set_lock_blocking(next);
2097  clean_tree_block(trans, root, next);
2098  btrfs_wait_tree_block_writeback(next);
2099  btrfs_tree_unlock(next);
2100 
2101  WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2102  ret = btrfs_free_and_pin_reserved_extent(root,
2103  path->nodes[*level]->start,
2104  path->nodes[*level]->len);
2105  BUG_ON(ret);
2106  }
2107  free_extent_buffer(path->nodes[*level]);
2108  path->nodes[*level] = NULL;
2109  *level = i + 1;
2110  }
2111  }
2112  return 1;
2113 }
2114 
2115 /*
2116  * drop the reference count on the tree rooted at 'log'. This traverses
2117  * the tree freeing any blocks that have a ref count of zero after being
2118  * decremented.
2119  */
2120 static int walk_log_tree(struct btrfs_trans_handle *trans,
2121  struct btrfs_root *log, struct walk_control *wc)
2122 {
2123  int ret = 0;
2124  int wret;
2125  int level;
2126  struct btrfs_path *path;
2127  int i;
2128  int orig_level;
2129 
2130  path = btrfs_alloc_path();
2131  if (!path)
2132  return -ENOMEM;
2133 
2134  level = btrfs_header_level(log->node);
2135  orig_level = level;
2136  path->nodes[level] = log->node;
2137  extent_buffer_get(log->node);
2138  path->slots[level] = 0;
2139 
2140  while (1) {
2141  wret = walk_down_log_tree(trans, log, path, &level, wc);
2142  if (wret > 0)
2143  break;
2144  if (wret < 0) {
2145  ret = wret;
2146  goto out;
2147  }
2148 
2149  wret = walk_up_log_tree(trans, log, path, &level, wc);
2150  if (wret > 0)
2151  break;
2152  if (wret < 0) {
2153  ret = wret;
2154  goto out;
2155  }
2156  }
2157 
2158  /* was the root node processed? if not, catch it here */
2159  if (path->nodes[orig_level]) {
2160  ret = wc->process_func(log, path->nodes[orig_level], wc,
2161  btrfs_header_generation(path->nodes[orig_level]));
2162  if (ret)
2163  goto out;
2164  if (wc->free) {
2165  struct extent_buffer *next;
2166 
2167  next = path->nodes[orig_level];
2168 
2169  btrfs_tree_lock(next);
2170  btrfs_set_lock_blocking(next);
2171  clean_tree_block(trans, log, next);
2172  btrfs_wait_tree_block_writeback(next);
2173  btrfs_tree_unlock(next);
2174 
2175  WARN_ON(log->root_key.objectid !=
2176  BTRFS_TREE_LOG_OBJECTID);
2177  ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2178  next->len);
2179  BUG_ON(ret); /* -ENOMEM or logic errors */
2180  }
2181  }
2182 
2183 out:
2184  for (i = 0; i <= orig_level; i++) {
2185  if (path->nodes[i]) {
2186  free_extent_buffer(path->nodes[i]);
2187  path->nodes[i] = NULL;
2188  }
2189  }
2190  btrfs_free_path(path);
2191  return ret;
2192 }
2193 
2194 /*
2195  * helper function to update the item for a given subvolume's log root
2196  * in the tree of log roots
2197  */
2198 static int update_log_root(struct btrfs_trans_handle *trans,
2199  struct btrfs_root *log)
2200 {
2201  int ret;
2202 
2203  if (log->log_transid == 1) {
2204  /* insert root item on the first sync */
2205  ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2206  &log->root_key, &log->root_item);
2207  } else {
2208  ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2209  &log->root_key, &log->root_item);
2210  }
2211  return ret;
2212 }
2213 
2214 static int wait_log_commit(struct btrfs_trans_handle *trans,
2215  struct btrfs_root *root, unsigned long transid)
2216 {
2217  DEFINE_WAIT(wait);
2218  int index = transid % 2;
2219 
2220  /*
2221  * we only allow two pending log transactions at a time,
2222  * so we know that if ours is more than 2 older than the
2223  * current transaction, we're done
2224  */
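 /*
  * Worked example: a commit of log transid 10 waits on slot 10 % 2 == 0,
  * transid 11 on slot 1. Once root->log_transid reaches transid + 2 the
  * slot has been recycled by a newer log commit, so the loop below can
  * stop waiting.
  */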
2225  do {
2226  prepare_to_wait(&root->log_commit_wait[index],
2227  &wait, TASK_UNINTERRUPTIBLE);
2228  mutex_unlock(&root->log_mutex);
2229 
2230  if (root->fs_info->last_trans_log_full_commit !=
2231  trans->transid && root->log_transid < transid + 2 &&
2232  atomic_read(&root->log_commit[index]))
2233  schedule();
2234 
2235  finish_wait(&root->log_commit_wait[index], &wait);
2236  mutex_lock(&root->log_mutex);
2237  } while (root->fs_info->last_trans_log_full_commit !=
2238  trans->transid && root->log_transid < transid + 2 &&
2239  atomic_read(&root->log_commit[index]));
2240  return 0;
2241 }
2242 
2243 static void wait_for_writer(struct btrfs_trans_handle *trans,
2244  struct btrfs_root *root)
2245 {
2246  DEFINE_WAIT(wait);
2247  while (root->fs_info->last_trans_log_full_commit !=
2248  trans->transid && atomic_read(&root->log_writers)) {
2249  prepare_to_wait(&root->log_writer_wait,
2250  &wait, TASK_UNINTERRUPTIBLE);
2251  mutex_unlock(&root->log_mutex);
2252  if (root->fs_info->last_trans_log_full_commit !=
2253  trans->transid && atomic_read(&root->log_writers))
2254  schedule();
2255  mutex_lock(&root->log_mutex);
2256  finish_wait(&root->log_writer_wait, &wait);
2257  }
2258 }
2259 
2260 /*
2261  * btrfs_sync_log sends a given tree log down to the disk and
2262  * updates the super blocks to record it. When this call is done,
2263  * you know that any inodes previously logged are safely on disk only
2264  * if it returns 0.
2265  *
2266  * Any other return value means you need to call btrfs_commit_transaction.
2267  * Some of the edge cases for fsyncing directories that have had unlinks
2268  * or renames done in the past mean that sometimes the only safe
2269  * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2270  * that has happened.
2271  */
2272 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2273  struct btrfs_root *root)
2274 {
2275  int index1;
2276  int index2;
2277  int mark;
2278  int ret;
2279  struct btrfs_root *log = root->log_root;
2280  struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2281  unsigned long log_transid = 0;
2282 
2283  mutex_lock(&root->log_mutex);
2284  index1 = root->log_transid % 2;
2285  if (atomic_read(&root->log_commit[index1])) {
2286  wait_log_commit(trans, root, root->log_transid);
2287  mutex_unlock(&root->log_mutex);
2288  return 0;
2289  }
2290  atomic_set(&root->log_commit[index1], 1);
2291 
2292  /* wait for previous tree log sync to complete */
2293  if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2294  wait_log_commit(trans, root, root->log_transid - 1);
2295  while (1) {
2296  int batch = atomic_read(&root->log_batch);
2297  /* when we're on an ssd, just kick the log commit out */
2298  if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2299  mutex_unlock(&root->log_mutex);
2300  schedule_timeout_uninterruptible(1);
2301  mutex_lock(&root->log_mutex);
2302  }
2303  wait_for_writer(trans, root);
2304  if (batch == atomic_read(&root->log_batch))
2305  break;
2306  }
2307 
2308  /* bail out if we need to do a full commit */
2309  if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2310  ret = -EAGAIN;
2311  mutex_unlock(&root->log_mutex);
2312  goto out;
2313  }
2314 
2315  log_transid = root->log_transid;
2316  if (log_transid % 2 == 0)
2317  mark = EXTENT_DIRTY;
2318  else
2319  mark = EXTENT_NEW;
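 /*
  * Two log commits can be in flight at once, so each tags its dirty
  * log pages with a different bit: EXTENT_DIRTY for even log transids,
  * EXTENT_NEW for odd ones. The write/wait calls below then only touch
  * the pages that belong to this commit.
  */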
2320 
2321  /* we start IO on all the marked extents here, but we don't actually
2322  * wait for them until later.
2323  */
2324  ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2325  if (ret) {
2326  btrfs_abort_transaction(trans, root, ret);
2327  mutex_unlock(&root->log_mutex);
2328  goto out;
2329  }
2330 
2331  btrfs_set_root_node(&log->root_item, log->node);
2332 
2333  root->log_transid++;
2334  log->log_transid = root->log_transid;
2335  root->log_start_pid = 0;
2336  smp_mb();
2337  /*
2338  * IO has been started, blocks of the log tree have WRITTEN flag set
2339  * in their headers. new modifications of the log will be written to
2340  * new positions. so it's safe to allow log writers to go in.
2341  */
2342  mutex_unlock(&root->log_mutex);
2343 
2344  mutex_lock(&log_root_tree->log_mutex);
2345  atomic_inc(&log_root_tree->log_batch);
2346  atomic_inc(&log_root_tree->log_writers);
2347  mutex_unlock(&log_root_tree->log_mutex);
2348 
2349  ret = update_log_root(trans, log);
2350 
2351  mutex_lock(&log_root_tree->log_mutex);
2352  if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2353  smp_mb();
2354  if (waitqueue_active(&log_root_tree->log_writer_wait))
2355  wake_up(&log_root_tree->log_writer_wait);
2356  }
2357 
2358  if (ret) {
2359  if (ret != -ENOSPC) {
2360  btrfs_abort_transaction(trans, root, ret);
2361  mutex_unlock(&log_root_tree->log_mutex);
2362  goto out;
2363  }
2364  root->fs_info->last_trans_log_full_commit = trans->transid;
2365  btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2366  mutex_unlock(&log_root_tree->log_mutex);
2367  ret = -EAGAIN;
2368  goto out;
2369  }
2370 
2371  index2 = log_root_tree->log_transid % 2;
2372  if (atomic_read(&log_root_tree->log_commit[index2])) {
2373  btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2374  wait_log_commit(trans, log_root_tree,
2375  log_root_tree->log_transid);
2376  mutex_unlock(&log_root_tree->log_mutex);
2377  ret = 0;
2378  goto out;
2379  }
2380  atomic_set(&log_root_tree->log_commit[index2], 1);
2381 
2382  if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2383  wait_log_commit(trans, log_root_tree,
2384  log_root_tree->log_transid - 1);
2385  }
2386 
2387  wait_for_writer(trans, log_root_tree);
2388 
2389  /*
2390  * now that we've moved on to the tree of log tree roots,
2391  * check the full commit flag again
2392  */
2393  if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2394  btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2395  mutex_unlock(&log_root_tree->log_mutex);
2396  ret = -EAGAIN;
2397  goto out_wake_log_root;
2398  }
2399 
2400  ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2401  &log_root_tree->dirty_log_pages,
2402  EXTENT_DIRTY | EXTENT_NEW);
2403  if (ret) {
2404  btrfs_abort_transaction(trans, root, ret);
2405  mutex_unlock(&log_root_tree->log_mutex);
2406  goto out_wake_log_root;
2407  }
2408  btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2409 
2410  btrfs_set_super_log_root(root->fs_info->super_for_commit,
2411  log_root_tree->node->start);
2412  btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2413  btrfs_header_level(log_root_tree->node));
2414 
2415  log_root_tree->log_transid++;
2416  smp_mb();
2417 
2418  mutex_unlock(&log_root_tree->log_mutex);
2419 
2420  /*
2421  * nobody else is going to jump in and write the ctree
2422  * super here because the log_commit atomic below is protecting
2423  * us. We must be called with a transaction handle pinning
2424  * the running transaction open, so a full commit can't hop
2425  * in and cause problems either.
2426  */
2427  btrfs_scrub_pause_super(root);
2428  ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2429  btrfs_scrub_continue_super(root);
2430  if (ret) {
2431  btrfs_abort_transaction(trans, root, ret);
2432  goto out_wake_log_root;
2433  }
2434 
2435  mutex_lock(&root->log_mutex);
2436  if (root->last_log_commit < log_transid)
2437  root->last_log_commit = log_transid;
2438  mutex_unlock(&root->log_mutex);
2439 
2440 out_wake_log_root:
2441  atomic_set(&log_root_tree->log_commit[index2], 0);
2442  smp_mb();
2443  if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2444  wake_up(&log_root_tree->log_commit_wait[index2]);
2445 out:
2446  atomic_set(&root->log_commit[index1], 0);
2447  smp_mb();
2448  if (waitqueue_active(&root->log_commit_wait[index1]))
2449  wake_up(&root->log_commit_wait[index1]);
2450  return ret;
2451 }
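/*
 * Rough usage sketch (hypothetical caller for illustration; in this
 * kernel the real call site is the fsync path, btrfs_sync_file() in
 * file.c):
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0 && btrfs_sync_log(trans, root) == 0)
 *		ret = btrfs_end_transaction(trans, root);    (log on disk)
 *	else
 *		ret = btrfs_commit_transaction(trans, root); (full commit)
 */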
2452 
2453 static void free_log_tree(struct btrfs_trans_handle *trans,
2454  struct btrfs_root *log)
2455 {
2456  int ret;
2457  u64 start;
2458  u64 end;
2459  struct walk_control wc = {
2460  .free = 1,
2461  .process_func = process_one_buffer
2462  };
2463 
2464  ret = walk_log_tree(trans, log, &wc);
2465  BUG_ON(ret);
2466 
2467  while (1) {
2468  ret = find_first_extent_bit(&log->dirty_log_pages,
2469  0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470  NULL);
2471  if (ret)
2472  break;
2473 
2474  clear_extent_bits(&log->dirty_log_pages, start, end,
2475  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2476  }
2477 
2478  free_extent_buffer(log->node);
2479  kfree(log);
2480 }
2481 
2482 /*
2483  * free all the extents used by the tree log. This should be called
2484  * at commit time of the full transaction
2485  */
2486 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2487 {
2488  if (root->log_root) {
2489  free_log_tree(trans, root->log_root);
2490  root->log_root = NULL;
2491  }
2492  return 0;
2493 }
2494 
2495 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2496  struct btrfs_fs_info *fs_info)
2497 {
2498  if (fs_info->log_root_tree) {
2499  free_log_tree(trans, fs_info->log_root_tree);
2500  fs_info->log_root_tree = NULL;
2501  }
2502  return 0;
2503 }
2504 
2505 /*
2506  * If both a file and directory are logged, and unlinks or renames are
2507  * mixed in, we have a few interesting corners:
2508  *
2509  * create file X in dir Y
2510  * link file X to X.link in dir Y
2511  * fsync file X
2512  * unlink file X but leave X.link
2513  * fsync dir Y
2514  *
2515  * After a crash we would expect only X.link to exist. But file X
2516  * didn't get fsync'd again so the log has back refs for X and X.link.
2517  *
2518  * We solve this by removing directory entries and inode backrefs from the
2519  * log when a file that was logged in the current transaction is
2520  * unlinked. Any later fsync will include the updated log entries, and
2521  * we'll be able to reconstruct the proper directory items from backrefs.
2522  *
2523  * This optimization allows us to avoid relogging the entire inode
2524  * or the entire directory.
2525  */
2526 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2527  struct btrfs_root *root,
2528  const char *name, int name_len,
2529  struct inode *dir, u64 index)
2530 {
2531  struct btrfs_root *log;
2532  struct btrfs_dir_item *di;
2533  struct btrfs_path *path;
2534  int ret;
2535  int err = 0;
2536  int bytes_del = 0;
2537  u64 dir_ino = btrfs_ino(dir);
2538 
2539  if (BTRFS_I(dir)->logged_trans < trans->transid)
2540  return 0;
2541 
2542  ret = join_running_log_trans(root);
2543  if (ret)
2544  return 0;
2545 
2546  mutex_lock(&BTRFS_I(dir)->log_mutex);
2547 
2548  log = root->log_root;
2549  path = btrfs_alloc_path();
2550  if (!path) {
2551  err = -ENOMEM;
2552  goto out_unlock;
2553  }
2554 
2555  di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2556  name, name_len, -1);
2557  if (IS_ERR(di)) {
2558  err = PTR_ERR(di);
2559  goto fail;
2560  }
2561  if (di) {
2562  ret = btrfs_delete_one_dir_name(trans, log, path, di);
2563  bytes_del += name_len;
2564  BUG_ON(ret);
2565  }
2566  btrfs_release_path(path);
2567  di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2568  index, name, name_len, -1);
2569  if (IS_ERR(di)) {
2570  err = PTR_ERR(di);
2571  goto fail;
2572  }
2573  if (di) {
2574  ret = btrfs_delete_one_dir_name(trans, log, path, di);
2575  bytes_del += name_len;
2576  BUG_ON(ret);
2577  }
2578 
2579  /* update the directory size in the log to reflect the names
2580  * we have removed
2581  */
2582  if (bytes_del) {
2583  struct btrfs_key key;
2584 
2585  key.objectid = dir_ino;
2586  key.offset = 0;
2587  key.type = BTRFS_INODE_ITEM_KEY;
2588  btrfs_release_path(path);
2589 
2590  ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2591  if (ret < 0) {
2592  err = ret;
2593  goto fail;
2594  }
2595  if (ret == 0) {
2596  struct btrfs_inode_item *item;
2597  u64 i_size;
2598 
2599  item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2600  struct btrfs_inode_item);
2601  i_size = btrfs_inode_size(path->nodes[0], item);
2602  if (i_size > bytes_del)
2603  i_size -= bytes_del;
2604  else
2605  i_size = 0;
2606  btrfs_set_inode_size(path->nodes[0], item, i_size);
2607  btrfs_mark_buffer_dirty(path->nodes[0]);
2608  } else
2609  ret = 0;
2610  btrfs_release_path(path);
2611  }
2612 fail:
2613  btrfs_free_path(path);
2614 out_unlock:
2615  mutex_unlock(&BTRFS_I(dir)->log_mutex);
2616  if (ret == -ENOSPC) {
2617  root->fs_info->last_trans_log_full_commit = trans->transid;
2618  ret = 0;
2619  } else if (ret < 0)
2620  btrfs_abort_transaction(trans, root, ret);
2621 
2622  btrfs_end_log_trans(root);
2623 
2624  return err;
2625 }
2626 
2627 /* see comments for btrfs_del_dir_entries_in_log */
2628 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2629  struct btrfs_root *root,
2630  const char *name, int name_len,
2631  struct inode *inode, u64 dirid)
2632 {
2633  struct btrfs_root *log;
2634  u64 index;
2635  int ret;
2636 
2637  if (BTRFS_I(inode)->logged_trans < trans->transid)
2638  return 0;
2639 
2640  ret = join_running_log_trans(root);
2641  if (ret)
2642  return 0;
2643  log = root->log_root;
2644  mutex_lock(&BTRFS_I(inode)->log_mutex);
2645 
2646  ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2647  dirid, &index);
2648  mutex_unlock(&BTRFS_I(inode)->log_mutex);
2649  if (ret == -ENOSPC) {
2650  root->fs_info->last_trans_log_full_commit = trans->transid;
2651  ret = 0;
2652  } else if (ret < 0 && ret != -ENOENT)
2653  btrfs_abort_transaction(trans, root, ret);
2654  btrfs_end_log_trans(root);
2655 
2656  return ret;
2657 }
2658 
2659 /*
2660  * creates a range item in the log for 'dirid'. first_offset and
2661  * last_offset tell us which parts of the key space the log should
2662  * be considered authoritative for.
2663  */
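/*
 * Example: a BTRFS_DIR_LOG_INDEX_KEY item with key offset 10 and a
 * dir_log_end of 25 tells replay that the log is authoritative for dir
 * index offsets [10, 25]; any index found in the subvolume in that
 * range but missing from the log was unlinked, and replay_dir_deletes()
 * / check_item_in_log() will remove it.
 */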
2664 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2665  struct btrfs_root *log,
2666  struct btrfs_path *path,
2667  int key_type, u64 dirid,
2668  u64 first_offset, u64 last_offset)
2669 {
2670  int ret;
2671  struct btrfs_key key;
2672  struct btrfs_dir_log_item *item;
2673 
2674  key.objectid = dirid;
2675  key.offset = first_offset;
2676  if (key_type == BTRFS_DIR_ITEM_KEY)
2677  key.type = BTRFS_DIR_LOG_ITEM_KEY;
2678  else
2679  key.type = BTRFS_DIR_LOG_INDEX_KEY;
2680  ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2681  if (ret)
2682  return ret;
2683 
2684  item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2685  struct btrfs_dir_log_item);
2686  btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2687  btrfs_mark_buffer_dirty(path->nodes[0]);
2688  btrfs_release_path(path);
2689  return 0;
2690 }
2691 
2692 /*
2693  * log all the items included in the current transaction for a given
2694  * directory. This also creates the range items in the log tree required
2695  * to replay anything deleted before the fsync
2696  */
2697 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2698  struct btrfs_root *root, struct inode *inode,
2699  struct btrfs_path *path,
2700  struct btrfs_path *dst_path, int key_type,
2701  u64 min_offset, u64 *last_offset_ret)
2702 {
2703  struct btrfs_key min_key;
2704  struct btrfs_key max_key;
2705  struct btrfs_root *log = root->log_root;
2706  struct extent_buffer *src;
2707  int err = 0;
2708  int ret;
2709  int i;
2710  int nritems;
2711  u64 first_offset = min_offset;
2712  u64 last_offset = (u64)-1;
2713  u64 ino = btrfs_ino(inode);
2714 
2715  log = root->log_root;
2716  max_key.objectid = ino;
2717  max_key.offset = (u64)-1;
2718  max_key.type = key_type;
2719 
2720  min_key.objectid = ino;
2721  min_key.type = key_type;
2722  min_key.offset = min_offset;
2723 
2724  path->keep_locks = 1;
2725 
2726  ret = btrfs_search_forward(root, &min_key, &max_key,
2727  path, 0, trans->transid);
2728 
2729  /*
2730  * we didn't find anything from this transaction, see if there
2731  * is anything at all
2732  */
2733  if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2734  min_key.objectid = ino;
2735  min_key.type = key_type;
2736  min_key.offset = (u64)-1;
2737  btrfs_release_path(path);
2738  ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2739  if (ret < 0) {
2740  btrfs_release_path(path);
2741  return ret;
2742  }
2743  ret = btrfs_previous_item(root, path, ino, key_type);
2744 
2745  /* if ret == 0 there are items for this type,
2746  * create a range to tell us the last key of this type.
2747  * otherwise, there are no items in this directory after
2748  * *min_offset, and we create a range to indicate that.
2749  */
2750  if (ret == 0) {
2751  struct btrfs_key tmp;
2752  btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2753  path->slots[0]);
2754  if (key_type == tmp.type)
2755  first_offset = max(min_offset, tmp.offset) + 1;
2756  }
2757  goto done;
2758  }
2759 
2760  /* go backward to find any previous key */
2761  ret = btrfs_previous_item(root, path, ino, key_type);
2762  if (ret == 0) {
2763  struct btrfs_key tmp;
2764  btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2765  if (key_type == tmp.type) {
2766  first_offset = tmp.offset;
2767  ret = overwrite_item(trans, log, dst_path,
2768  path->nodes[0], path->slots[0],
2769  &tmp);
2770  if (ret) {
2771  err = ret;
2772  goto done;
2773  }
2774  }
2775  }
2776  btrfs_release_path(path);
2777 
2778  /* find the first key from this transaction again */
2779  ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2780  if (ret != 0) {
2781  WARN_ON(1);
2782  goto done;
2783  }
2784 
2785  /*
2786  * we have a block from this transaction, log every item in it
2787  * from our directory
2788  */
2789  while (1) {
2790  struct btrfs_key tmp;
2791  src = path->nodes[0];
2792  nritems = btrfs_header_nritems(src);
2793  for (i = path->slots[0]; i < nritems; i++) {
2794  btrfs_item_key_to_cpu(src, &min_key, i);
2795 
2796  if (min_key.objectid != ino || min_key.type != key_type)
2797  goto done;
2798  ret = overwrite_item(trans, log, dst_path, src, i,
2799  &min_key);
2800  if (ret) {
2801  err = ret;
2802  goto done;
2803  }
2804  }
2805  path->slots[0] = nritems;
2806 
2807  /*
2808  * look ahead to the next item and see if it is also
2809  * from this directory and from this transaction
2810  */
2811  ret = btrfs_next_leaf(root, path);
2812  if (ret == 1) {
2813  last_offset = (u64)-1;
2814  goto done;
2815  }
2816  btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2817  if (tmp.objectid != ino || tmp.type != key_type) {
2818  last_offset = (u64)-1;
2819  goto done;
2820  }
2821  if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2822  ret = overwrite_item(trans, log, dst_path,
2823  path->nodes[0], path->slots[0],
2824  &tmp);
2825  if (ret)
2826  err = ret;
2827  else
2828  last_offset = tmp.offset;
2829  goto done;
2830  }
2831  }
2832 done:
2833  btrfs_release_path(path);
2834  btrfs_release_path(dst_path);
2835 
2836  if (err == 0) {
2837  *last_offset_ret = last_offset;
2838  /*
2839  * insert the log range keys to indicate where the log
2840  * is valid
2841  */
2842  ret = insert_dir_log_key(trans, log, path, key_type,
2843  ino, first_offset, last_offset);
2844  if (ret)
2845  err = ret;
2846  }
2847  return err;
2848 }
2849 
2850 /*
2851  * logging directories is very similar to logging inodes. We find all the items
2852  * from the current transaction and write them to the log.
2853  *
2854  * The recovery code scans the directory in the subvolume, and if it finds a
2855  * key in the range logged that is not present in the log tree, then it means
2856  * that dir entry was unlinked during the transaction.
2857  *
2858  * In order for that scan to work, we must include one key smaller than
2859  * the smallest logged by this transaction and one key larger than the largest
2860  * key logged by this transaction.
2861  */
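/*
 * The loop below keeps calling log_dir_items() until the returned
 * last_offset is (u64)-1, so each of the two key spaces (DIR_ITEM
 * first, then DIR_INDEX) ends up covered by a contiguous chain of dir
 * log range items.
 */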
2862 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2863  struct btrfs_root *root, struct inode *inode,
2864  struct btrfs_path *path,
2865  struct btrfs_path *dst_path)
2866 {
2867  u64 min_key;
2868  u64 max_key;
2869  int ret;
2870  int key_type = BTRFS_DIR_ITEM_KEY;
2871 
2872 again:
2873  min_key = 0;
2874  max_key = 0;
2875  while (1) {
2876  ret = log_dir_items(trans, root, inode, path,
2877  dst_path, key_type, min_key,
2878  &max_key);
2879  if (ret)
2880  return ret;
2881  if (max_key == (u64)-1)
2882  break;
2883  min_key = max_key + 1;
2884  }
2885 
2886  if (key_type == BTRFS_DIR_ITEM_KEY) {
2887  key_type = BTRFS_DIR_INDEX_KEY;
2888  goto again;
2889  }
2890  return 0;
2891 }
2892 
2893 /*
2894  * a helper function to drop items from the log before we relog an
2895  * inode. max_key_type indicates the highest item type to remove.
2896  * This cannot be run for file data extents because it does not
2897  * free the extents they point to.
2898  */
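/*
 * Example: when relogging a directory this is called with max_key_type
 * == BTRFS_DIR_LOG_INDEX_KEY. Keys sort by (objectid, type, offset),
 * so the backwards search from (objectid, max_key_type, (u64)-1) below
 * sweeps away every previously logged item for that objectid.
 */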
2899 static int drop_objectid_items(struct btrfs_trans_handle *trans,
2900  struct btrfs_root *log,
2901  struct btrfs_path *path,
2902  u64 objectid, int max_key_type)
2903 {
2904  int ret;
2905  struct btrfs_key key;
2906  struct btrfs_key found_key;
2907  int start_slot;
2908 
2909  key.objectid = objectid;
2910  key.type = max_key_type;
2911  key.offset = (u64)-1;
2912 
2913  while (1) {
2914  ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2915  BUG_ON(ret == 0);
2916  if (ret < 0)
2917  break;
2918 
2919  if (path->slots[0] == 0)
2920  break;
2921 
2922  path->slots[0]--;
2923  btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2924  path->slots[0]);
2925 
2926  if (found_key.objectid != objectid)
2927  break;
2928 
2929  found_key.offset = 0;
2930  found_key.type = 0;
2931  ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932  &start_slot);
2933 
2934  ret = btrfs_del_items(trans, log, path, start_slot,
2935  path->slots[0] - start_slot + 1);
2936  /*
2937  * If start slot isn't 0 then we don't need to re-search, we've
2938  * found the last guy with the objectid in this tree.
2939  */
2940  if (ret || start_slot != 0)
2941  break;
2942  btrfs_release_path(path);
2943  }
2944  btrfs_release_path(path);
2945  if (ret > 0)
2946  ret = 0;
2947  return ret;
2948 }
2949 
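/*
 * copy the in-memory inode fields (ownership, mode, link count,
 * timestamps, byte counts, flags) into a log tree inode item. With
 * log_inode_only set, generation and size are zeroed so replay can tell
 * a bare "this inode exists" record from a full update.
 */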
2950 static void fill_inode_item(struct btrfs_trans_handle *trans,
2951  struct extent_buffer *leaf,
2952  struct btrfs_inode_item *item,
2953  struct inode *inode, int log_inode_only)
2954 {
2955  btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2956  btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2957  btrfs_set_inode_mode(leaf, item, inode->i_mode);
2958  btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959 
2960  btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961  inode->i_atime.tv_sec);
2962  btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963  inode->i_atime.tv_nsec);
2964 
2965  btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966  inode->i_mtime.tv_sec);
2967  btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968  inode->i_mtime.tv_nsec);
2969 
2970  btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971  inode->i_ctime.tv_sec);
2972  btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973  inode->i_ctime.tv_nsec);
2974 
2975  btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976 
2977  btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978  btrfs_set_inode_transid(leaf, item, trans->transid);
2979  btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980  btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981  btrfs_set_inode_block_group(leaf, item, 0);
2982 
2983  if (log_inode_only) {
2984  /* set the generation to zero so the recovery code
2985  * can tell the difference between a logging
2986  * just to say 'this inode exists' and a logging
2987  * to say 'update this inode with these values'
2988  */
2989  btrfs_set_inode_generation(leaf, item, 0);
2990  btrfs_set_inode_size(leaf, item, 0);
2991  } else {
2992  btrfs_set_inode_generation(leaf, item,
2993  BTRFS_I(inode)->generation);
2994  btrfs_set_inode_size(leaf, item, inode->i_size);
2995  }
2996 
2997 }
2998 
2999 static noinline int copy_items(struct btrfs_trans_handle *trans,
3000  struct inode *inode,
3001  struct btrfs_path *dst_path,
3002  struct extent_buffer *src,
3003  int start_slot, int nr, int inode_only)
3004 {
3005  unsigned long src_offset;
3006  unsigned long dst_offset;
3007  struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3008  struct btrfs_file_extent_item *extent;
3009  struct btrfs_inode_item *inode_item;
3010  int ret;
3011  struct btrfs_key *ins_keys;
3012  u32 *ins_sizes;
3013  char *ins_data;
3014  int i;
3015  struct list_head ordered_sums;
3016  int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3017 
3018  INIT_LIST_HEAD(&ordered_sums);
3019 
3020  ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3021  nr * sizeof(u32), GFP_NOFS);
3022  if (!ins_data)
3023  return -ENOMEM;
3024 
3025  ins_sizes = (u32 *)ins_data;
3026  ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3027 
3028  for (i = 0; i < nr; i++) {
3029  ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3030  btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3031  }
3032  ret = btrfs_insert_empty_items(trans, log, dst_path,
3033  ins_keys, ins_sizes, nr);
3034  if (ret) {
3035  kfree(ins_data);
3036  return ret;
3037  }
3038 
3039  for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3040  dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3041  dst_path->slots[0]);
3042 
3043  src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3044 
3045  if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3046  inode_item = btrfs_item_ptr(dst_path->nodes[0],
3047  dst_path->slots[0],
3048  struct btrfs_inode_item);
3049  fill_inode_item(trans, dst_path->nodes[0], inode_item,
3050  inode, inode_only == LOG_INODE_EXISTS);
3051  } else {
3052  copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3053  src_offset, ins_sizes[i]);
3054  }
3055 
3056  /* take a reference on file data extents so that truncates
3057  * or deletes of this inode don't have to relog the inode
3058  * again
3059  */
3060  if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3061  !skip_csum) {
3062  int found_type;
3063  extent = btrfs_item_ptr(src, start_slot + i,
3064  struct btrfs_file_extent_item);
3065 
3066  if (btrfs_file_extent_generation(src, extent) < trans->transid)
3067  continue;
3068 
3069  found_type = btrfs_file_extent_type(src, extent);
3070  if (found_type == BTRFS_FILE_EXTENT_REG) {
3071  u64 ds, dl, cs, cl;
3072  ds = btrfs_file_extent_disk_bytenr(src,
3073  extent);
3074  /* ds == 0 is a hole */
3075  if (ds == 0)
3076  continue;
3077 
3078  dl = btrfs_file_extent_disk_num_bytes(src,
3079  extent);
3080  cs = btrfs_file_extent_offset(src, extent);
3081  cl = btrfs_file_extent_num_bytes(src,
3082  extent);
3083  if (btrfs_file_extent_compression(src,
3084  extent)) {
3085  cs = 0;
3086  cl = dl;
3087  }
3088 
3089  ret = btrfs_lookup_csums_range(
3090  log->fs_info->csum_root,
3091  ds + cs, ds + cs + cl - 1,
3092  &ordered_sums, 0);
3093  BUG_ON(ret);
3094  }
3095  }
3096  }
3097 
3098  btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3099  btrfs_release_path(dst_path);
3100  kfree(ins_data);
3101 
3102  /*
3103  * we have to do this after the loop above to avoid changing the
3104  * log tree while trying to change the log tree.
3105  */
3106  ret = 0;
3107  while (!list_empty(&ordered_sums)) {
3108  struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3109  struct btrfs_ordered_sum,
3110  list);
3111  if (!ret)
3112  ret = btrfs_csum_file_blocks(trans, log, sums);
3113  list_del(&sums->list);
3114  kfree(sums);
3115  }
3116  return ret;
3117 }
3118 
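/* list_sort() comparator: orders extent maps by ascending em->start */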
3119 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3120 {
3121  struct extent_map *em1, *em2;
3122 
3123  em1 = list_entry(a, struct extent_map, list);
3124  em2 = list_entry(b, struct extent_map, list);
3125 
3126  if (em1->start < em2->start)
3127  return -1;
3128  else if (em1->start > em2->start)
3129  return 1;
3130  return 0;
3131 }
3132 
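/*
 * scratch state carried between log_one_extent() calls so that runs of
 * adjacent file extent items can be batched into a single copy_items()
 */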
3133 struct log_args {
3134  struct extent_buffer *src;
3135  u64 next_offset;
3136  int start_slot;
3137  int nr;
3138 };
3139 
3140 static int log_one_extent(struct btrfs_trans_handle *trans,
3141  struct inode *inode, struct btrfs_root *root,
3142  struct extent_map *em, struct btrfs_path *path,
3143  struct btrfs_path *dst_path, struct log_args *args)
3144 {
3145  struct btrfs_root *log = root->log_root;
3146  struct btrfs_file_extent_item *fi;
3147  struct btrfs_key key;
3148  u64 start = em->mod_start;
3149  u64 search_start = start;
3150  u64 len = em->mod_len;
3151  u64 num_bytes;
3152  int nritems;
3153  int ret;
3154 
3155  if (BTRFS_I(inode)->logged_trans == trans->transid) {
3156  ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
3157  start + len, NULL, 0);
3158  if (ret)
3159  return ret;
3160  }
3161 
3162  while (len) {
3163  if (args->nr)
3164  goto next_slot;
3165 again:
3166  key.objectid = btrfs_ino(inode);
3167  key.type = BTRFS_EXTENT_DATA_KEY;
3168  key.offset = search_start;
3169 
3170  ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3171  if (ret < 0)
3172  return ret;
3173 
3174  if (ret) {
3175  /*
3176  * A rare case where we can have an em for a section of a
3177  * larger extent so we need to make sure that this em
3178  * falls within the extent we've found. If not we just
3179  * bail and go back to ye-olde way of doing things but
3180  * it happens often enough in testing that we need to do
3181  * this dance to make sure.
3182  */
3183  do {
3184  if (path->slots[0] == 0) {
3185  btrfs_release_path(path);
3186  if (search_start == 0)
3187  return -ENOENT;
3188  search_start--;
3189  goto again;
3190  }
3191 
3192  path->slots[0]--;
3193  btrfs_item_key_to_cpu(path->nodes[0], &key,
3194  path->slots[0]);
3195  if (key.objectid != btrfs_ino(inode) ||
3196  key.type != BTRFS_EXTENT_DATA_KEY) {
3197  btrfs_release_path(path);
3198  return -ENOENT;
3199  }
3200  } while (key.offset > start);
3201 
3202  fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3203  struct btrfs_file_extent_item);
3204  num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205  fi);
3206  if (key.offset + num_bytes <= start) {
3207  btrfs_release_path(path);
3208  return -ENOENT;
3209  }
3210  }
3211  args->src = path->nodes[0];
3212 next_slot:
3213  btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214  fi = btrfs_item_ptr(args->src, path->slots[0],
3215  struct btrfs_file_extent_item);
3216  if (args->nr &&
3217  args->start_slot + args->nr == path->slots[0]) {
3218  args->nr++;
3219  } else if (args->nr) {
3220  ret = copy_items(trans, inode, dst_path, args->src,
3221  args->start_slot, args->nr,
3222  LOG_INODE_ALL);
3223  if (ret)
3224  return ret;
3225  args->nr = 1;
3226  args->start_slot = path->slots[0];
3227  } else if (!args->nr) {
3228  args->nr = 1;
3229  args->start_slot = path->slots[0];
3230  }
3231  nritems = btrfs_header_nritems(path->nodes[0]);
3232  path->slots[0]++;
3233  num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234  if (len < num_bytes) {
3235  /* I _think_ this is ok, envision we write to a
3236  * preallocated space that is adjacent to a previously
3237  * written preallocated space that gets merged when we
3238  * mark this preallocated space written. If we do not
3239  * have the adjacent extent in cache then when we copy
3240  * this extent it could end up being larger than our EM
3241  * thinks it is, which is a-ok, so just set len to 0.
3242  */
3243  len = 0;
3244  } else {
3245  len -= num_bytes;
3246  }
3247  start = key.offset + num_bytes;
3248  args->next_offset = start;
3249  search_start = start;
3250 
3251  if (path->slots[0] < nritems) {
3252  if (len)
3253  goto next_slot;
3254  break;
3255  }
3256 
3257  if (args->nr) {
3258  ret = copy_items(trans, inode, dst_path, args->src,
3259  args->start_slot, args->nr,
3260  LOG_INODE_ALL);
3261  if (ret)
3262  return ret;
3263  args->nr = 0;
3264  btrfs_release_path(path);
3265  }
3266  }
3267 
3268  return 0;
3269 }
3270 
3271 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272  struct btrfs_root *root,
3273  struct inode *inode,
3274  struct btrfs_path *path,
3275  struct btrfs_path *dst_path)
3276 {
3277  struct log_args args;
3278  struct extent_map *em, *n;
3279  struct list_head extents;
3280  struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3281  u64 test_gen;
3282  int ret = 0;
3283 
3284  INIT_LIST_HEAD(&extents);
3285 
3286  memset(&args, 0, sizeof(args));
3287 
3288  write_lock(&tree->lock);
3289  test_gen = root->fs_info->last_trans_committed;
3290 
3291  list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3292  list_del_init(&em->list);
3293  if (em->generation <= test_gen)
3294  continue;
3295  /* Need a ref to keep it from getting evicted from cache */
3296  atomic_inc(&em->refs);
3297  set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3298  list_add_tail(&em->list, &extents);
3299  }
3300 
3301  list_sort(NULL, &extents, extent_cmp);
3302 
3303  while (!list_empty(&extents)) {
3304  em = list_entry(extents.next, struct extent_map, list);
3305 
3306  list_del_init(&em->list);
3307  clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308 
3309  /*
3310  * If we had an error we just need to delete everybody from our
3311  * private list.
3312  */
3313  if (ret) {
3314  free_extent_map(em);
3315  continue;
3316  }
3317 
3318  write_unlock(&tree->lock);
3319 
3320  /*
3321  * If the previous EM and the last extent we left off on aren't
3322  * sequential then we need to copy the items we have and redo
3323  * our search
3324  */
3325  if (args.nr && em->mod_start != args.next_offset) {
3326  ret = copy_items(trans, inode, dst_path, args.src,
3327  args.start_slot, args.nr,
3328  LOG_INODE_ALL);
3329  if (ret) {
3330  free_extent_map(em);
3331  write_lock(&tree->lock);
3332  continue;
3333  }
3334  btrfs_release_path(path);
3335  args.nr = 0;
3336  }
3337 
3338  ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339  free_extent_map(em);
3340  write_lock(&tree->lock);
3341  }
3342  WARN_ON(!list_empty(&extents));
3343  write_unlock(&tree->lock);
3344 
3345  if (!ret && args.nr)
3346  ret = copy_items(trans, inode, dst_path, args.src,
3347  args.start_slot, args.nr, LOG_INODE_ALL);
3348  btrfs_release_path(path);
3349  return ret;
3350 }
3351 
3352 /* log a single inode in the tree log.
3353  * At least one parent directory for this inode must exist in the tree
3354  * or be logged already.
3355  *
3356  * Any items from this inode changed by the current transaction are copied
3357  * to the log tree. An extra reference is taken on any extents in this
3358  * file, allowing us to avoid a whole pile of corner cases around logging
3359  * blocks that have been removed from the tree.
3360  *
3361  * See LOG_INODE_ALL and related defines for a description of what inode_only
3362  * does.
3363  *
3364  * This handles both files and directories.
3365  */
3366 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3367  struct btrfs_root *root, struct inode *inode,
3368  int inode_only)
3369 {
3370  struct btrfs_path *path;
3371  struct btrfs_path *dst_path;
3372  struct btrfs_key min_key;
3373  struct btrfs_key max_key;
3374  struct btrfs_root *log = root->log_root;
3375  struct extent_buffer *src = NULL;
3376  int err = 0;
3377  int ret;
3378  int nritems;
3379  int ins_start_slot = 0;
3380  int ins_nr;
3381  bool fast_search = false;
3382  u64 ino = btrfs_ino(inode);
3383 
3384  log = root->log_root;
3385 
3386  path = btrfs_alloc_path();
3387  if (!path)
3388  return -ENOMEM;
3389  dst_path = btrfs_alloc_path();
3390  if (!dst_path) {
3391  btrfs_free_path(path);
3392  return -ENOMEM;
3393  }
3394 
3395  min_key.objectid = ino;
3396  min_key.type = BTRFS_INODE_ITEM_KEY;
3397  min_key.offset = 0;
3398 
3399  max_key.objectid = ino;
3400 
3401 
3402  /* today the code can only do partial logging of directories */
3403  if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
3404  max_key.type = BTRFS_XATTR_ITEM_KEY;
3405  else
3406  max_key.type = (u8)-1;
3407  max_key.offset = (u64)-1;
3408 
3409  /* Only run delayed items if we are a dir or a new file */
3410  if (S_ISDIR(inode->i_mode) ||
3411  BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3412  ret = btrfs_commit_inode_delayed_items(trans, inode);
3413  if (ret) {
3414  btrfs_free_path(path);
3415  btrfs_free_path(dst_path);
3416  return ret;
3417  }
3418  }
3419 
3420  mutex_lock(&BTRFS_I(inode)->log_mutex);
3421 
3422  /*
3423  * a brute force approach to making sure we get the most uptodate
3424  * copies of everything.
3425  */
3426  if (S_ISDIR(inode->i_mode)) {
3427  int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3428 
3429  if (inode_only == LOG_INODE_EXISTS)
3430  max_key_type = BTRFS_XATTR_ITEM_KEY;
3431  ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3432  } else {
3433  if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434  &BTRFS_I(inode)->runtime_flags)) {
3435  ret = btrfs_truncate_inode_items(trans, log,
3436  inode, 0, 0);
3437  } else {
3438  fast_search = true;
3439  max_key.type = BTRFS_XATTR_ITEM_KEY;
3440  ret = drop_objectid_items(trans, log, path, ino,
3441  BTRFS_XATTR_ITEM_KEY);
3442  }
3443  }
3444  if (ret) {
3445  err = ret;
3446  goto out_unlock;
3447  }
3448  path->keep_locks = 1;
3449 
3450  while (1) {
3451  ins_nr = 0;
3452  ret = btrfs_search_forward(root, &min_key, &max_key,
3453  path, 0, trans->transid);
3454  if (ret != 0)
3455  break;
3456 again:
3457  /* note, ins_nr might be > 0 here, cleanup outside the loop */
3458  if (min_key.objectid != ino)
3459  break;
3460  if (min_key.type > max_key.type)
3461  break;
3462 
3463  src = path->nodes[0];
3464  if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3465  ins_nr++;
3466  goto next_slot;
3467  } else if (!ins_nr) {
3468  ins_start_slot = path->slots[0];
3469  ins_nr = 1;
3470  goto next_slot;
3471  }
3472 
3473  ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3474  ins_nr, inode_only);
3475  if (ret) {
3476  err = ret;
3477  goto out_unlock;
3478  }
3479  ins_nr = 1;
3480  ins_start_slot = path->slots[0];
3481 next_slot:
3482 
3483  nritems = btrfs_header_nritems(path->nodes[0]);
3484  path->slots[0]++;
3485  if (path->slots[0] < nritems) {
3486  btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3487  path->slots[0]);
3488  goto again;
3489  }
3490  if (ins_nr) {
3491  ret = copy_items(trans, inode, dst_path, src,
3492  ins_start_slot,
3493  ins_nr, inode_only);
3494  if (ret) {
3495  err = ret;
3496  goto out_unlock;
3497  }
3498  ins_nr = 0;
3499  }
3500  btrfs_release_path(path);
3501 
3502  if (min_key.offset < (u64)-1)
3503  min_key.offset++;
3504  else if (min_key.type < (u8)-1)
3505  min_key.type++;
3506  else if (min_key.objectid < (u64)-1)
3507  min_key.objectid++;
3508  else
3509  break;
3510  }
3511  if (ins_nr) {
3512  ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3513  ins_nr, inode_only);
3514  if (ret) {
3515  err = ret;
3516  goto out_unlock;
3517  }
3518  ins_nr = 0;
3519  }
3520 
3521  if (fast_search) {
3522  btrfs_release_path(path);
3523  btrfs_release_path(dst_path);
3524  ret = btrfs_log_changed_extents(trans, root, inode, path,
3525  dst_path);
3526  if (ret) {
3527  err = ret;
3528  goto out_unlock;
3529  }
3530  } else {
3531  struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532  struct extent_map *em, *n;
3533 
3534  list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535  list_del_init(&em->list);
3536  }
3537 
3538  if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3539  btrfs_release_path(path);
3540  btrfs_release_path(dst_path);
3541  ret = log_directory_changes(trans, root, inode, path, dst_path);
3542  if (ret) {
3543  err = ret;
3544  goto out_unlock;
3545  }
3546  }
3547  BTRFS_I(inode)->logged_trans = trans->transid;
3548  BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3549 out_unlock:
3550  mutex_unlock(&BTRFS_I(inode)->log_mutex);
3551 
3552  btrfs_free_path(path);
3553  btrfs_free_path(dst_path);
3554  return err;
3555 }
3556 
3557 /*
3558  * follow the dentry parent pointers up the chain and see if any
3559  * of the directories in it require a full commit before they can
3560  * be logged. Returns zero if nothing special needs to be done or 1 if
3561  * a full commit is required.
3562  */
3563 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
3564  struct inode *inode,
3565  struct dentry *parent,
3566  struct super_block *sb,
3567  u64 last_committed)
3568 {
3569  int ret = 0;
3570  struct btrfs_root *root;
3571  struct dentry *old_parent = NULL;
3572 
3573  /*
3574  * for regular files, if its inode is already on disk, we don't
3575  * have to worry about the parents at all. This is because
3576  * we can use the last_unlink_trans field to record renames
3577  * and other fun in this file.
3578  */
3579  if (S_ISREG(inode->i_mode) &&
3580  BTRFS_I(inode)->generation <= last_committed &&
3581  BTRFS_I(inode)->last_unlink_trans <= last_committed)
3582  goto out;
3583 
3584  if (!S_ISDIR(inode->i_mode)) {
3585  if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3586  goto out;
3587  inode = parent->d_inode;
3588  }
3589 
3590  while (1) {
3591  BTRFS_I(inode)->logged_trans = trans->transid;
3592  smp_mb();
3593 
3594  if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
3595  root = BTRFS_I(inode)->root;
3596 
3597  /*
3598  * make sure any commits to the log are forced
3599  * to be full commits
3600  */
3601  root->fs_info->last_trans_log_full_commit =
3602  trans->transid;
3603  ret = 1;
3604  break;
3605  }
3606 
3607  if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3608  break;
3609 
3610  if (IS_ROOT(parent))
3611  break;
3612 
3613  parent = dget_parent(parent);
3614  dput(old_parent);
3615  old_parent = parent;
3616  inode = parent->d_inode;
3617 
3618  }
3619  dput(old_parent);
3620 out:
3621  return ret;
3622 }
3623 
3624 /*
3625  * helper function around btrfs_log_inode to make sure newly created
3626  * parent directories also end up in the log. A minimal, inode-and-
3627  * backref-only logging pass is done for any parent directories that are
3628  * older than the last committed transaction
3629  */
3630 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3631  struct btrfs_root *root, struct inode *inode,
3632  struct dentry *parent, int exists_only)
3633 {
3634  int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3635  struct super_block *sb;
3636  struct dentry *old_parent = NULL;
3637  int ret = 0;
3638  u64 last_committed = root->fs_info->last_trans_committed;
3639 
3640  sb = inode->i_sb;
3641 
3642  if (btrfs_test_opt(root, NOTREELOG)) {
3643  ret = 1;
3644  goto end_no_trans;
3645  }
3646 
3647  if (root->fs_info->last_trans_log_full_commit >
3648  root->fs_info->last_trans_committed) {
3649  ret = 1;
3650  goto end_no_trans;
3651  }
3652 
3653  if (root != BTRFS_I(inode)->root ||
3654  btrfs_root_refs(&root->root_item) == 0) {
3655  ret = 1;
3656  goto end_no_trans;
3657  }
3658 
3659  ret = check_parent_dirs_for_sync(trans, inode, parent,
3660  sb, last_committed);
3661  if (ret)
3662  goto end_no_trans;
3663 
3664  if (btrfs_inode_in_log(inode, trans->transid)) {
3665  ret = BTRFS_NO_LOG_SYNC;
3666  goto end_no_trans;
3667  }
3668 
3669  ret = start_log_trans(trans, root);
3670  if (ret)
3671  goto end_trans;
3672 
3673  ret = btrfs_log_inode(trans, root, inode, inode_only);
3674  if (ret)
3675  goto end_trans;
3676 
3677  /*
3678  * for regular files, if its inode is already on disk, we don't
3679  * have to worry about the parents at all. This is because
3680  * we can use the last_unlink_trans field to record renames
3681  * and other fun in this file.
3682  */
3683  if (S_ISREG(inode->i_mode) &&
3684  BTRFS_I(inode)->generation <= last_committed &&
3685  BTRFS_I(inode)->last_unlink_trans <= last_committed) {
3686  ret = 0;
3687  goto end_trans;
3688  }
3689 
3690  inode_only = LOG_INODE_EXISTS;
3691  while (1) {
3692  if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3693  break;
3694 
3695  inode = parent->d_inode;
3696  if (root != BTRFS_I(inode)->root)
3697  break;
3698 
3699  if (BTRFS_I(inode)->generation >
3700  root->fs_info->last_trans_committed) {
3701  ret = btrfs_log_inode(trans, root, inode, inode_only);
3702  if (ret)
3703  goto end_trans;
3704  }
3705  if (IS_ROOT(parent))
3706  break;
3707 
3708  parent = dget_parent(parent);
3709  dput(old_parent);
3710  old_parent = parent;
3711  }
3712  ret = 0;
3713 end_trans:
3714  dput(old_parent);
3715  if (ret < 0) {
3716  WARN_ON(ret != -ENOSPC);
3717  root->fs_info->last_trans_log_full_commit = trans->transid;
3718  ret = 1;
3719  }
3720  btrfs_end_log_trans(root);
3721 end_no_trans:
3722  return ret;
3723 }
3724 
3725 /*
3726  * it is not safe to log the dentry if the chunk root has added new
3727  * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
3728  * If this returns 1, you must commit the transaction to safely get your
3729  * data on disk.
3730  */
3731 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3732  struct btrfs_root *root, struct dentry *dentry)
3733 {
3734  struct dentry *parent = dget_parent(dentry);
3735  int ret;
3736 
3737  ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3738  dput(parent);
3739 
3740  return ret;
3741 }
3742 
3743 /*
3744  * should be called during mount to recover and replay any log trees
3745  * from the FS
3746  */
3747 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3748 {
3749  int ret;
3750  struct btrfs_path *path;
3751  struct btrfs_trans_handle *trans;
3752  struct btrfs_key key;
3753  struct btrfs_key found_key;
3754  struct btrfs_key tmp_key;
3755  struct btrfs_root *log;
3756  struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
3757  struct walk_control wc = {
3758  .process_func = process_one_buffer,
3759  .stage = 0,
3760  };
3761 
3762  path = btrfs_alloc_path();
3763  if (!path)
3764  return -ENOMEM;
3765 
3766  fs_info->log_root_recovering = 1;
3767 
3768  trans = btrfs_start_transaction(fs_info->tree_root, 0);
3769  if (IS_ERR(trans)) {
3770  ret = PTR_ERR(trans);
3771  goto error;
3772  }
3773 
3774  wc.trans = trans;
3775  wc.pin = 1;
3776 
3777  ret = walk_log_tree(trans, log_root_tree, &wc);
3778  if (ret) {
3779  btrfs_error(fs_info, ret, "Failed to pin buffers while "
3780  "recovering log root tree.");
3781  goto error;
3782  }
3783 
3784 again:
3785  key.objectid = BTRFS_TREE_LOG_OBJECTID;
3786  key.offset = (u64)-1;
3787  btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
3788 
3789  while (1) {
3790  ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
3791 
3792  if (ret < 0) {
3793  btrfs_error(fs_info, ret,
3794  "Couldn't find tree log root.");
3795  goto error;
3796  }
3797  if (ret > 0) {
3798  if (path->slots[0] == 0)
3799  break;
3800  path->slots[0]--;
3801  }
3802  btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3803  path->slots[0]);
3804  btrfs_release_path(path);
3805  if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3806  break;
3807 
3808  log = btrfs_read_fs_root_no_radix(log_root_tree,
3809  &found_key);
3810  if (IS_ERR(log)) {
3811  ret = PTR_ERR(log);
3812  btrfs_error(fs_info, ret,
3813  "Couldn't read tree log root.");
3814  goto error;
3815  }
3816 
3817  tmp_key.objectid = found_key.offset;
3818  tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3819  tmp_key.offset = (u64)-1;
3820 
3821  wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3822  if (IS_ERR(wc.replay_dest)) {
3823  ret = PTR_ERR(wc.replay_dest);
3824  btrfs_error(fs_info, ret, "Couldn't read target root "
3825  "for tree log recovery.");
3826  goto error;
3827  }
3828 
3829  wc.replay_dest->log_root = log;
3830  btrfs_record_root_in_trans(trans, wc.replay_dest);
3831  ret = walk_log_tree(trans, log, &wc);
3832  BUG_ON(ret);
3833 
3834  if (wc.stage == LOG_WALK_REPLAY_ALL) {
3835  ret = fixup_inode_link_counts(trans, wc.replay_dest,
3836  path);
3837  BUG_ON(ret);
3838  }
3839 
3840  key.offset = found_key.offset - 1;
3841  wc.replay_dest->log_root = NULL;
3842  free_extent_buffer(log->node);
3843  free_extent_buffer(log->commit_root);
3844  kfree(log);
3845 
3846  if (found_key.offset == 0)
3847  break;
3848  }
3849  btrfs_release_path(path);
3850 
3851  /* step one is to pin it all, step two is to replay just inodes */
3852  if (wc.pin) {
3853  wc.pin = 0;
3854  wc.process_func = replay_one_buffer;
3855  wc.stage = LOG_WALK_REPLAY_INODES;
3856  goto again;
3857  }
3858  /* step three is to replay everything */
3859  if (wc.stage < LOG_WALK_REPLAY_ALL) {
3860  wc.stage++;
3861  goto again;
3862  }
3863 
3864  btrfs_free_path(path);
3865 
3866  free_extent_buffer(log_root_tree->node);
3867  log_root_tree->log_root = NULL;
3868  fs_info->log_root_recovering = 0;
3869 
3870  /* step 4: commit the transaction, which also unpins the blocks */
3871  btrfs_commit_transaction(trans, fs_info->tree_root);
3872 
3873  kfree(log_root_tree);
3874  return 0;
3875 
3876 error:
3877  btrfs_free_path(path);
3878  return ret;
3879 }
3880 
3881 /*
3882  * there are some corner cases where we want to force a full
3883  * commit instead of allowing a directory to be logged.
3884  *
3885  * They revolve around files that were unlinked from the directory, and
3886  * this function updates the parent directory so that a full commit is
3887  * properly done if it is fsync'd later after the unlinks are done.
3888  */
3889 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3890  struct inode *dir, struct inode *inode,
3891  int for_rename)
3892 {
3893  /*
3894  * when we're logging a file, if it hasn't been renamed
3895  * or unlinked, and its inode is fully committed on disk,
3896  * we don't have to worry about walking up the directory chain
3897  * to log its parents.
3898  *
3899  * So, we use the last_unlink_trans field to put this transid
3900  * into the file. When the file is logged we check it and
3901  * don't log the parents if the file is fully on disk.
3902  */
3903  if (S_ISREG(inode->i_mode))
3904  BTRFS_I(inode)->last_unlink_trans = trans->transid;
3905 
3906  /*
3907  * if this directory was already logged any new
3908  * names for this file/dir will get recorded
3909  */
3910  smp_mb();
3911  if (BTRFS_I(dir)->logged_trans == trans->transid)
3912  return;
3913 
3914  /*
3915  * if the inode we're about to unlink was logged,
3916  * the log will be properly updated for any new names
3917  */
3918  if (BTRFS_I(inode)->logged_trans == trans->transid)
3919  return;
3920 
3921  /*
3922  * when renaming files across directories, if the directory
3923  * we're unlinking from gets fsync'd later on, there's
3924  * no way to find the destination directory later and fsync it
3925  * properly. So, we have to be conservative and force commits
3926  * so the new name gets discovered.
3927  */
3928  if (for_rename)
3929  goto record;
3930 
3931  /* we can safely do the unlink without any special recording */
3932  return;
3933 
3934 record:
3935  BTRFS_I(dir)->last_unlink_trans = trans->transid;
3936 }
3937 
3938 /*
3939  * Call this after adding a new name for a file and it will properly
3940  * update the log to reflect the new name.
3941  *
3942  * It will return zero if all goes well, and it will return 1 if a
3943  * full transaction commit is required.
3944  */
3945 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3946  struct inode *inode, struct inode *old_dir,
3947  struct dentry *parent)
3948 {
3949  struct btrfs_root *root = BTRFS_I(inode)->root;
3950 
3951  /*
3952  * this will force the logging code to walk the dentry chain
3953  * up for the file
3954  */
3955  if (S_ISREG(inode->i_mode))
3956  BTRFS_I(inode)->last_unlink_trans = trans->transid;
3957 
3958  /*
3959  * if this inode hasn't been logged and the directory we're renaming it
3960  * from hasn't been logged, we don't need to log it
3961  */
3962  if (BTRFS_I(inode)->logged_trans <=
3963  root->fs_info->last_trans_committed &&
3964  (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3965  root->fs_info->last_trans_committed))
3966  return 0;
3967 
3968  return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3969 }
3970