Linux Kernel 3.7.1
refcounttree.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * refcounttree.c
5  *
6  * Copyright (C) 2009 Oracle. All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * General Public License for more details.
16  */
17 
18 #include <linux/sort.h>
19 #include <cluster/masklog.h>
20 #include "ocfs2.h"
21 #include "inode.h"
22 #include "alloc.h"
23 #include "suballoc.h"
24 #include "journal.h"
25 #include "uptodate.h"
26 #include "super.h"
27 #include "buffer_head_io.h"
28 #include "blockcheck.h"
29 #include "refcounttree.h"
30 #include "sysfile.h"
31 #include "dlmglue.h"
32 #include "extent_map.h"
33 #include "aops.h"
34 #include "xattr.h"
35 #include "namei.h"
36 #include "ocfs2_trace.h"
37 
38 #include <linux/bio.h>
39 #include <linux/blkdev.h>
40 #include <linux/slab.h>
41 #include <linux/writeback.h>
42 #include <linux/pagevec.h>
43 #include <linux/swap.h>
44 #include <linux/security.h>
45 #include <linux/fsnotify.h>
46 #include <linux/quotaops.h>
47 #include <linux/namei.h>
48 #include <linux/mount.h>
49 
50 struct ocfs2_cow_context {
51  struct inode *inode;
52  struct file *file;
53  u32 cow_start;
54  u32 cow_len;
55  struct ocfs2_extent_tree data_et;
56  struct ocfs2_refcount_tree *ref_tree;
57  struct buffer_head *ref_root_bh;
58  struct ocfs2_alloc_context *meta_ac;
59  struct ocfs2_alloc_context *data_ac;
60  struct ocfs2_cached_dealloc_ctxt dealloc;
61  void *cow_object;
62  struct ocfs2_post_refcount *post_refcount;
63  int extra_credits;
64  int (*get_clusters)(struct ocfs2_cow_context *context,
65  u32 v_cluster, u32 *p_cluster,
66  u32 *num_clusters,
67  unsigned int *extent_flags);
68  int (*cow_duplicate_clusters)(handle_t *handle,
69  struct file *file,
70  u32 cpos, u32 old_cluster,
71  u32 new_cluster, u32 new_len);
72 };
73 
74 static inline struct ocfs2_refcount_tree *
75 cache_info_to_refcount(struct ocfs2_caching_info *ci)
76 {
77  return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78 }
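/*
 * [Editor's sketch, not part of the original file.]  The cast above
 * works because rf_ci is embedded inside struct ocfs2_refcount_tree,
 * so container_of() is pure pointer arithmetic:
 *
 *	struct ocfs2_refcount_tree *t = ...;
 *	struct ocfs2_caching_info *ci = &t->rf_ci;
 *	assert(cache_info_to_refcount(ci) == t);   (always holds)
 */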
79 
80 static int ocfs2_validate_refcount_block(struct super_block *sb,
81  struct buffer_head *bh)
82 {
83  int rc;
84  struct ocfs2_refcount_block *rb =
85  (struct ocfs2_refcount_block *)bh->b_data;
86 
87  trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
88 
89  BUG_ON(!buffer_uptodate(bh));
90 
91  /*
92  * If the ecc fails, we return the error but otherwise
93  * leave the filesystem running. We know any error is
94  * local to this block.
95  */
96  rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
97  if (rc) {
98  mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
99  (unsigned long long)bh->b_blocknr);
100  return rc;
101  }
102 
103 
104  if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
105  ocfs2_error(sb,
106  "Refcount block #%llu has bad signature %.*s",
107  (unsigned long long)bh->b_blocknr, 7,
108  rb->rf_signature);
109  return -EINVAL;
110  }
111 
112  if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
113  ocfs2_error(sb,
114  "Refcount block #%llu has an invalid rf_blkno "
115  "of %llu",
116  (unsigned long long)bh->b_blocknr,
117  (unsigned long long)le64_to_cpu(rb->rf_blkno));
118  return -EINVAL;
119  }
120 
121  if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
122  ocfs2_error(sb,
123  "Refcount block #%llu has an invalid "
124  "rf_fs_generation of #%u",
125  (unsigned long long)bh->b_blocknr,
126  le32_to_cpu(rb->rf_fs_generation));
127  return -EINVAL;
128  }
129 
130  return 0;
131 }
132 
133 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
134  u64 rb_blkno,
135  struct buffer_head **bh)
136 {
137  int rc;
138  struct buffer_head *tmp = *bh;
139 
140  rc = ocfs2_read_block(ci, rb_blkno, &tmp,
141  ocfs2_validate_refcount_block);
142 
143  /* If ocfs2_read_block() got us a new bh, pass it up. */
144  if (!rc && !*bh)
145  *bh = tmp;
146 
147  return rc;
148 }
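/*
 * [Editor's usage sketch, not original code.]  Callers may pass a
 * NULL *bh to get a freshly read buffer, or an existing buffer_head
 * to have it re-validated:
 *
 *	struct buffer_head *bh = NULL;
 *	int rc = ocfs2_read_refcount_block(ci, rb_blkno, &bh);
 *	if (!rc) {
 *		(use bh->b_data as a struct ocfs2_refcount_block)
 *		brelse(bh);
 *	}
 */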
149 
150 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
151 {
152  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
153 
154  return rf->rf_blkno;
155 }
156 
157 static struct super_block *
158 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
159 {
160  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
161 
162  return rf->rf_sb;
163 }
164 
165 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
166 {
167  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
168 
169  spin_lock(&rf->rf_lock);
170 }
171 
172 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
173 {
174  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
175 
176  spin_unlock(&rf->rf_lock);
177 }
178 
179 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
180 {
181  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
182 
183  mutex_lock(&rf->rf_io_mutex);
184 }
185 
186 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
187 {
188  struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
189 
190  mutex_unlock(&rf->rf_io_mutex);
191 }
192 
193 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
194  .co_owner = ocfs2_refcount_cache_owner,
195  .co_get_super = ocfs2_refcount_cache_get_super,
196  .co_cache_lock = ocfs2_refcount_cache_lock,
197  .co_cache_unlock = ocfs2_refcount_cache_unlock,
198  .co_io_lock = ocfs2_refcount_cache_io_lock,
199  .co_io_unlock = ocfs2_refcount_cache_io_unlock,
200 };
201 
202 static struct ocfs2_refcount_tree *
203 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
204 {
205  struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
206  struct ocfs2_refcount_tree *tree = NULL;
207 
208  while (n) {
209  tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
210 
211  if (blkno < tree->rf_blkno)
212  n = n->rb_left;
213  else if (blkno > tree->rf_blkno)
214  n = n->rb_right;
215  else
216  return tree;
217  }
218 
219  return NULL;
220 }
221 
222 /* osb_lock is already locked. */
223 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
224  struct ocfs2_refcount_tree *new)
225 {
226  u64 rf_blkno = new->rf_blkno;
227  struct rb_node *parent = NULL;
228  struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
229  struct ocfs2_refcount_tree *tmp;
230 
231  while (*p) {
232  parent = *p;
233 
234  tmp = rb_entry(parent, struct ocfs2_refcount_tree,
235  rf_node);
236 
237  if (rf_blkno < tmp->rf_blkno)
238  p = &(*p)->rb_left;
239  else if (rf_blkno > tmp->rf_blkno)
240  p = &(*p)->rb_right;
241  else {
242  /* This should never happen! */
243  mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
244  (unsigned long long)rf_blkno);
245  BUG();
246  }
247  }
248 
249  rb_link_node(&new->rf_node, parent, p);
250  rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
251 }
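/*
 * [Editor's note, not part of the original file.]  The two functions
 * above are the standard kernel rbtree idiom: the lookup walks
 * left/right on rf_blkno, and the insert walks to a NULL child slot
 * while remembering the parent, then calls rb_link_node() and
 * rb_insert_color() to rebalance.  A caller sketch (osb_lock held):
 *
 *	if (!ocfs2_find_refcount_tree(osb, new->rf_blkno))
 *		ocfs2_insert_refcount_tree(osb, new);
 */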
252 
253 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
254 {
255  ocfs2_metadata_cache_exit(&tree->rf_ci);
256  ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
257  ocfs2_lock_res_free(&tree->rf_lockres);
258  kfree(tree);
259 }
260 
261 static inline void
262 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
263  struct ocfs2_refcount_tree *tree)
264 {
265  rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
266  if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
267  osb->osb_ref_tree_lru = NULL;
268 }
269 
270 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
271  struct ocfs2_refcount_tree *tree)
272 {
273  spin_lock(&osb->osb_lock);
274  ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
275  spin_unlock(&osb->osb_lock);
276 }
277 
278 static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
279 {
280  struct ocfs2_refcount_tree *tree =
281  container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
282 
283  ocfs2_free_refcount_tree(tree);
284 }
285 
286 static inline void
287 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
288 {
289  kref_get(&tree->rf_getcnt);
290 }
291 
292 static inline void
293 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
294 {
295  kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
296 }
297 
298 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
299  struct super_block *sb)
300 {
301  ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
302  mutex_init(&new->rf_io_mutex);
303  new->rf_sb = sb;
304  spin_lock_init(&new->rf_lock);
305 }
306 
307 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
308  struct ocfs2_refcount_tree *new,
309  u64 rf_blkno, u32 generation)
310 {
311  init_rwsem(&new->rf_sem);
312  ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
313  rf_blkno, generation);
314 }
315 
316 static struct ocfs2_refcount_tree*
317 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
318 {
319  struct ocfs2_refcount_tree *new;
320 
321  new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
322  if (!new)
323  return NULL;
324 
325  new->rf_blkno = rf_blkno;
326  kref_init(&new->rf_getcnt);
327  ocfs2_init_refcount_tree_ci(new, osb->sb);
328 
329  return new;
330 }
331 
332 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
333  struct ocfs2_refcount_tree **ret_tree)
334 {
335  int ret = 0;
336  struct ocfs2_refcount_tree *tree, *new = NULL;
337  struct buffer_head *ref_root_bh = NULL;
338  struct ocfs2_refcount_block *ref_rb;
339 
340  spin_lock(&osb->osb_lock);
341  if (osb->osb_ref_tree_lru &&
342  osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
343  tree = osb->osb_ref_tree_lru;
344  else
345  tree = ocfs2_find_refcount_tree(osb, rf_blkno);
346  if (tree)
347  goto out;
348 
349  spin_unlock(&osb->osb_lock);
350 
351  new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
352  if (!new) {
353  ret = -ENOMEM;
354  mlog_errno(ret);
355  return ret;
356  }
357  /*
358  * We need the generation to create the refcount tree lock and since
359  * it isn't changed during the tree modification, we are safe here to
360  * read without protection.
361  * We also have to purge the cache after we create the lock since the
362  * refcount block may have stale data. It can only be trusted when
363  * we hold the refcount lock.
364  */
365  ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
366  if (ret) {
367  mlog_errno(ret);
368  ocfs2_metadata_cache_exit(&new->rf_ci);
369  kfree(new);
370  return ret;
371  }
372 
373  ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
374  new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
375  ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
376  new->rf_generation);
377  ocfs2_metadata_cache_purge(&new->rf_ci);
378 
379  spin_lock(&osb->osb_lock);
380  tree = ocfs2_find_refcount_tree(osb, rf_blkno);
381  if (tree)
382  goto out;
383 
384  ocfs2_insert_refcount_tree(osb, new);
385 
386  tree = new;
387  new = NULL;
388 
389 out:
390  *ret_tree = tree;
391 
392  osb->osb_ref_tree_lru = tree;
393 
394  spin_unlock(&osb->osb_lock);
395 
396  if (new)
397  ocfs2_free_refcount_tree(new);
398 
399  brelse(ref_root_bh);
400  return ret;
401 }
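/*
 * [Editor's note, not part of the original file.]  The function above
 * is an optimistic-allocation pattern: look up under osb_lock, drop
 * the lock to allocate and read from disk, then re-take the lock and
 * re-check before inserting.  If another thread inserted first, the
 * second lookup wins and our "new" tree is freed after the lock is
 * dropped.  The osb_ref_tree_lru single-entry cache makes repeated
 * lookups of the same tree O(1):
 *
 *	struct ocfs2_refcount_tree *t;
 *	if (!ocfs2_get_refcount_tree(osb, rf_blkno, &t))
 *		(t is now in the rb-tree and cached in the lru slot)
 */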
402 
403 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
404 {
405  int ret;
406  struct buffer_head *di_bh = NULL;
407  struct ocfs2_dinode *di;
408 
409  ret = ocfs2_read_inode_block(inode, &di_bh);
410  if (ret) {
411  mlog_errno(ret);
412  goto out;
413  }
414 
415  BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
416 
417  di = (struct ocfs2_dinode *)di_bh->b_data;
418  *ref_blkno = le64_to_cpu(di->i_refcount_loc);
419  brelse(di_bh);
420 out:
421  return ret;
422 }
423 
424 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
425  struct ocfs2_refcount_tree *tree, int rw)
426 {
427  int ret;
428 
429  ret = ocfs2_refcount_lock(tree, rw);
430  if (ret) {
431  mlog_errno(ret);
432  goto out;
433  }
434 
435  if (rw)
436  down_write(&tree->rf_sem);
437  else
438  down_read(&tree->rf_sem);
439 
440 out:
441  return ret;
442 }
443 
444 /*
445  * Lock the refcount tree pointed to by ref_blkno and return the tree.
446  * In most cases, we lock the tree and read the refcount block.
447  * So read it here if the caller really needs it.
448  *
449  * If the tree has been re-created by another node, it will free the
450  * old one and re-create it.
451  */
452 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
453  u64 ref_blkno, int rw,
454  struct ocfs2_refcount_tree **ret_tree,
455  struct buffer_head **ref_bh)
456 {
457  int ret, delete_tree = 0;
458  struct ocfs2_refcount_tree *tree = NULL;
459  struct buffer_head *ref_root_bh = NULL;
460  struct ocfs2_refcount_block *rb;
461 
462 again:
463  ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
464  if (ret) {
465  mlog_errno(ret);
466  return ret;
467  }
468 
469  ocfs2_refcount_tree_get(tree);
470 
471  ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
472  if (ret) {
473  mlog_errno(ret);
474  ocfs2_refcount_tree_put(tree);
475  goto out;
476  }
477 
478  ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
479  &ref_root_bh);
480  if (ret) {
481  mlog_errno(ret);
482  ocfs2_unlock_refcount_tree(osb, tree, rw);
483  ocfs2_refcount_tree_put(tree);
484  goto out;
485  }
486 
487  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
488  /*
489  * If the refcount block has been freed and re-created, we may need
490  * to recreate the refcount tree also.
491  *
492  * Here we just remove the tree from the rb-tree, and the last
493  * kref holder will unlock and delete this refcount_tree.
494  * Then we goto "again" and ocfs2_get_refcount_tree will create
495  * the new refcount tree for us.
496  */
497  if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
498  if (!tree->rf_removed) {
499  ocfs2_erase_refcount_tree_from_list(osb, tree);
500  tree->rf_removed = 1;
501  delete_tree = 1;
502  }
503 
504  ocfs2_unlock_refcount_tree(osb, tree, rw);
505  /*
506  * We get an extra reference when we create the refcount
507  * tree, so another put will destroy it.
508  */
509  if (delete_tree)
510  ocfs2_refcount_tree_put(tree);
511  brelse(ref_root_bh);
512  ref_root_bh = NULL;
513  goto again;
514  }
515 
516  *ret_tree = tree;
517  if (ref_bh) {
518  *ref_bh = ref_root_bh;
519  ref_root_bh = NULL;
520  }
521 out:
522  brelse(ref_root_bh);
523  return ret;
524 }
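/*
 * [Editor's usage sketch, not original code.]  Callers bracket
 * refcount work with the lock/unlock pair; rw = 1 takes the cluster
 * lock and rf_sem for writing, rw = 0 for reading:
 *
 *	struct ocfs2_refcount_tree *tree;
 *	struct buffer_head *ref_root_bh = NULL;
 *
 *	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1,
 *				       &tree, &ref_root_bh);
 *	if (!ret) {
 *		(modify the tree under the write lock)
 *		ocfs2_unlock_refcount_tree(osb, tree, 1);
 *		brelse(ref_root_bh);
 *	}
 */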
525 
526 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
527  struct ocfs2_refcount_tree *tree, int rw)
528 {
529  if (rw)
530  up_write(&tree->rf_sem);
531  else
532  up_read(&tree->rf_sem);
533 
534  ocfs2_refcount_unlock(tree, rw);
535  ocfs2_refcount_tree_put(tree);
536 }
537 
538 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
539 {
540  struct rb_node *node;
541  struct ocfs2_refcount_tree *tree;
542  struct rb_root *root = &osb->osb_rf_lock_tree;
543 
544  while ((node = rb_last(root)) != NULL) {
545  tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
546 
547  trace_ocfs2_purge_refcount_trees(
548  (unsigned long long) tree->rf_blkno);
549 
550  rb_erase(&tree->rf_node, root);
551  ocfs2_free_refcount_tree(tree);
552  }
553 }
554 
555 /*
556  * Create a refcount tree for an inode.
557  * We take for granted that the inode is already locked.
558  */
559 static int ocfs2_create_refcount_tree(struct inode *inode,
560  struct buffer_head *di_bh)
561 {
562  int ret;
563  handle_t *handle = NULL;
564  struct ocfs2_alloc_context *meta_ac = NULL;
565  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
566  struct ocfs2_inode_info *oi = OCFS2_I(inode);
567  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
568  struct buffer_head *new_bh = NULL;
569  struct ocfs2_refcount_block *rb;
570  struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571  u16 suballoc_bit_start;
572  u32 num_got;
573  u64 suballoc_loc, first_blkno;
574 
575  BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576 
577  trace_ocfs2_create_refcount_tree(
578  (unsigned long long)OCFS2_I(inode)->ip_blkno);
579 
580  ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
581  if (ret) {
582  mlog_errno(ret);
583  goto out;
584  }
585 
586  handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
587  if (IS_ERR(handle)) {
588  ret = PTR_ERR(handle);
589  mlog_errno(ret);
590  goto out;
591  }
592 
593  ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
594  OCFS2_JOURNAL_ACCESS_WRITE);
595  if (ret) {
596  mlog_errno(ret);
597  goto out_commit;
598  }
599 
600  ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601  &suballoc_bit_start, &num_got,
602  &first_blkno);
603  if (ret) {
604  mlog_errno(ret);
605  goto out_commit;
606  }
607 
608  new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
609  if (!new_tree) {
610  ret = -ENOMEM;
611  mlog_errno(ret);
612  goto out_commit;
613  }
614 
615  new_bh = sb_getblk(inode->i_sb, first_blkno);
616  ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
617 
618  ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
619  OCFS2_JOURNAL_ACCESS_CREATE);
620  if (ret) {
621  mlog_errno(ret);
622  goto out_commit;
623  }
624 
625  /* Initialize ocfs2_refcount_block. */
626  rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627  memset(rb, 0, inode->i_sb->s_blocksize);
628  strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629  rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630  rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
631  rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
632  rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
633  rb->rf_blkno = cpu_to_le64(first_blkno);
634  rb->rf_count = cpu_to_le32(1);
635  rb->rf_records.rl_count =
636  cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
637  spin_lock(&osb->osb_lock);
638  rb->rf_generation = osb->s_next_generation++;
639  spin_unlock(&osb->osb_lock);
640 
641  ocfs2_journal_dirty(handle, new_bh);
642 
643  spin_lock(&oi->ip_lock);
644  oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
645  di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
646  di->i_refcount_loc = cpu_to_le64(first_blkno);
647  spin_unlock(&oi->ip_lock);
648 
649  trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
650 
651  ocfs2_journal_dirty(handle, di_bh);
652 
653  /*
654  * We have to init the tree lock here since it will use
655  * the generation number to create it.
656  */
657  new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
658  ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
659  new_tree->rf_generation);
660 
661  spin_lock(&osb->osb_lock);
662  tree = ocfs2_find_refcount_tree(osb, first_blkno);
663 
664  /*
665  * We've just created a new refcount tree in this block. If
666  * we found a refcount tree on the ocfs2_super, it must be
667  * one we just deleted. We free the old tree before
668  * inserting the new tree.
669  */
670  BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
671  if (tree)
672  ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
673  ocfs2_insert_refcount_tree(osb, new_tree);
674  spin_unlock(&osb->osb_lock);
675  new_tree = NULL;
676  if (tree)
677  ocfs2_refcount_tree_put(tree);
678 
679 out_commit:
680  ocfs2_commit_trans(osb, handle);
681 
682 out:
683  if (new_tree) {
684  ocfs2_metadata_cache_exit(&new_tree->rf_ci);
685  kfree(new_tree);
686  }
687 
688  brelse(new_bh);
689  if (meta_ac)
690  ocfs2_free_alloc_context(meta_ac);
691 
692  return ret;
693 }
694 
695 static int ocfs2_set_refcount_tree(struct inode *inode,
696  struct buffer_head *di_bh,
697  u64 refcount_loc)
698 {
699  int ret;
700  handle_t *handle = NULL;
701  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
702  struct ocfs2_inode_info *oi = OCFS2_I(inode);
703  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
704  struct buffer_head *ref_root_bh = NULL;
705  struct ocfs2_refcount_block *rb;
706  struct ocfs2_refcount_tree *ref_tree;
707 
708  BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
709 
710  ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
711  &ref_tree, &ref_root_bh);
712  if (ret) {
713  mlog_errno(ret);
714  return ret;
715  }
716 
717  handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
718  if (IS_ERR(handle)) {
719  ret = PTR_ERR(handle);
720  mlog_errno(ret);
721  goto out;
722  }
723 
724  ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
725  OCFS2_JOURNAL_ACCESS_WRITE);
726  if (ret) {
727  mlog_errno(ret);
728  goto out_commit;
729  }
730 
731  ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
732  OCFS2_JOURNAL_ACCESS_WRITE);
733  if (ret) {
734  mlog_errno(ret);
735  goto out_commit;
736  }
737 
738  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
739  le32_add_cpu(&rb->rf_count, 1);
740 
741  ocfs2_journal_dirty(handle, ref_root_bh);
742 
743  spin_lock(&oi->ip_lock);
744  oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
745  di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
746  di->i_refcount_loc = cpu_to_le64(refcount_loc);
747  spin_unlock(&oi->ip_lock);
748  ocfs2_journal_dirty(handle, di_bh);
749 
750 out_commit:
751  ocfs2_commit_trans(osb, handle);
752 out:
753  ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
754  brelse(ref_root_bh);
755 
756  return ret;
757 }
758 
759 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
760 {
761  int ret, delete_tree = 0;
762  handle_t *handle = NULL;
763  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
764  struct ocfs2_inode_info *oi = OCFS2_I(inode);
765  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
766  struct ocfs2_refcount_block *rb;
767  struct inode *alloc_inode = NULL;
768  struct buffer_head *alloc_bh = NULL;
769  struct buffer_head *blk_bh = NULL;
770  struct ocfs2_refcount_tree *ref_tree;
771  int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
772  u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
773  u16 bit = 0;
774 
775  if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
776  return 0;
777 
778  BUG_ON(!ref_blkno);
779  ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
780  if (ret) {
781  mlog_errno(ret);
782  return ret;
783  }
784 
785  rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
786 
787  /*
788  * If we are the last user, we need to free the block.
789  * So lock the allocator ahead of time.
790  */
791  if (le32_to_cpu(rb->rf_count) == 1) {
792  blk = le64_to_cpu(rb->rf_blkno);
793  bit = le16_to_cpu(rb->rf_suballoc_bit);
794  if (rb->rf_suballoc_loc)
795  bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796  else
797  bg_blkno = ocfs2_which_suballoc_group(blk, bit);
798 
799  alloc_inode = ocfs2_get_system_file_inode(osb,
800  EXTENT_ALLOC_SYSTEM_INODE,
801  le16_to_cpu(rb->rf_suballoc_slot));
802  if (!alloc_inode) {
803  ret = -ENOMEM;
804  mlog_errno(ret);
805  goto out;
806  }
807  mutex_lock(&alloc_inode->i_mutex);
808 
809  ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
810  if (ret) {
811  mlog_errno(ret);
812  goto out_mutex;
813  }
814 
815  credits += OCFS2_SUBALLOC_FREE;
816  }
817 
818  handle = ocfs2_start_trans(osb, credits);
819  if (IS_ERR(handle)) {
820  ret = PTR_ERR(handle);
821  mlog_errno(ret);
822  goto out_unlock;
823  }
824 
825  ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
826  OCFS2_JOURNAL_ACCESS_WRITE);
827  if (ret) {
828  mlog_errno(ret);
829  goto out_commit;
830  }
831 
832  ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
833  OCFS2_JOURNAL_ACCESS_WRITE);
834  if (ret) {
835  mlog_errno(ret);
836  goto out_commit;
837  }
838 
839  spin_lock(&oi->ip_lock);
840  oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
841  di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
842  di->i_refcount_loc = 0;
843  spin_unlock(&oi->ip_lock);
844  ocfs2_journal_dirty(handle, di_bh);
845 
846  le32_add_cpu(&rb->rf_count , -1);
847  ocfs2_journal_dirty(handle, blk_bh);
848 
849  if (!rb->rf_count) {
850  delete_tree = 1;
851  ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
852  ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
853  alloc_bh, bit, bg_blkno, 1);
854  if (ret)
855  mlog_errno(ret);
856  }
857 
858 out_commit:
859  ocfs2_commit_trans(osb, handle);
860 out_unlock:
861  if (alloc_inode) {
862  ocfs2_inode_unlock(alloc_inode, 1);
863  brelse(alloc_bh);
864  }
865 out_mutex:
866  if (alloc_inode) {
867  mutex_unlock(&alloc_inode->i_mutex);
868  iput(alloc_inode);
869  }
870 out:
871  ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
872  if (delete_tree)
873  ocfs2_refcount_tree_put(ref_tree);
874  brelse(blk_bh);
875 
876  return ret;
877 }
878 
879 static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
880  struct buffer_head *ref_leaf_bh,
881  u64 cpos, unsigned int len,
882  struct ocfs2_refcount_rec *ret_rec,
883  int *index)
884 {
885  int i = 0;
886  struct ocfs2_refcount_block *rb =
887  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
888  struct ocfs2_refcount_rec *rec = NULL;
889 
890  for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
891  rec = &rb->rf_records.rl_recs[i];
892 
893  if (le64_to_cpu(rec->r_cpos) +
894  le32_to_cpu(rec->r_clusters) <= cpos)
895  continue;
896  else if (le64_to_cpu(rec->r_cpos) > cpos)
897  break;
898 
899  /* ok, cpos falls in this rec. Just return. */
900  if (ret_rec)
901  *ret_rec = *rec;
902  goto out;
903  }
904 
905  if (ret_rec) {
906  /* We've hit a hole here, so fake the rec. */
907  ret_rec->r_cpos = cpu_to_le64(cpos);
908  ret_rec->r_refcount = 0;
909  if (i < le16_to_cpu(rb->rf_records.rl_used) &&
910  le64_to_cpu(rec->r_cpos) < cpos + len)
911  ret_rec->r_clusters =
912  cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
913  else
914  ret_rec->r_clusters = cpu_to_le32(len);
915  }
916 
917 out:
918  *index = i;
919 }
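/*
 * [Editor's worked example, not part of the original file.]  Suppose
 * the leaf holds records (r_cpos, r_clusters, r_refcount) of
 * (0, 4, 1) and (10, 4, 2), and we search cpos = 5, len = 10:
 * record 0 ends at 4 <= 5 and is skipped; record 1 starts past 5, so
 * the loop breaks with i = 1.  A fake hole record is then returned:
 * r_cpos = 5, r_refcount = 0, r_clusters = 10 - 5 = 5, clipped at the
 * start of the next real record.
 */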
920 
921 /*
922  * Try to remove refcount tree. The mechanism is:
923  * 1) Check whether i_clusters == 0; if not, exit.
924  * 2) Check whether we have i_xattr_loc in the dinode; if yes, exit.
925  * 3) Check whether we have inline xattr stored outside, if yes, exit.
926  * 4) Remove the tree.
927  */
928 int ocfs2_try_remove_refcount_tree(struct inode *inode,
929  struct buffer_head *di_bh)
930 {
931  int ret;
932  struct ocfs2_inode_info *oi = OCFS2_I(inode);
933  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
934 
935  down_write(&oi->ip_xattr_sem);
936  down_write(&oi->ip_alloc_sem);
937 
938  if (oi->ip_clusters)
939  goto out;
940 
941  if (di->i_xattr_loc)
942  goto out;
943 
944  if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
945  ocfs2_has_inline_xattr_value_outside(inode, di))
946  goto out;
947 
948  ret = ocfs2_remove_refcount_tree(inode, di_bh);
949  if (ret)
950  mlog_errno(ret);
951 out:
952  up_write(&oi->ip_alloc_sem);
953  up_write(&oi->ip_xattr_sem);
954  return 0;
955 }
956 
957 /*
958  * Find the end range for a leaf refcount block indicated by
959  * el->l_recs[index].e_blkno.
960  */
961 static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
962  struct buffer_head *ref_root_bh,
963  struct ocfs2_extent_block *eb,
964  struct ocfs2_extent_list *el,
965  int index, u32 *cpos_end)
966 {
967  int ret, i, subtree_root;
968  u32 cpos;
969  u64 blkno;
970  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
971  struct ocfs2_path *left_path = NULL, *right_path = NULL;
972  struct ocfs2_extent_tree et;
973  struct ocfs2_extent_list *tmp_el;
974 
975  if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
976  /*
977  * We have an extent rec after index, so just use the e_cpos
978  * of the next extent rec.
979  */
980  *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
981  return 0;
982  }
983 
984  if (!eb || (eb && !eb->h_next_leaf_blk)) {
985  /*
986  * We are the last extent rec, so any high cpos should
987  * be stored in this leaf refcount block.
988  */
989  *cpos_end = UINT_MAX;
990  return 0;
991  }
992 
993  /*
994  * If the extent block isn't the last one, we have to find
995  * the subtree root between this extent block and the next
996  * leaf extent block and get the corresponding e_cpos from
997  * the subroot. Otherwise we may corrupt the b-tree.
998  */
999  ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1000 
1001  left_path = ocfs2_new_path_from_et(&et);
1002  if (!left_path) {
1003  ret = -ENOMEM;
1004  mlog_errno(ret);
1005  goto out;
1006  }
1007 
1008  cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1009  ret = ocfs2_find_path(ci, left_path, cpos);
1010  if (ret) {
1011  mlog_errno(ret);
1012  goto out;
1013  }
1014 
1015  right_path = ocfs2_new_path_from_path(left_path);
1016  if (!right_path) {
1017  ret = -ENOMEM;
1018  mlog_errno(ret);
1019  goto out;
1020  }
1021 
1022  ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1023  if (ret) {
1024  mlog_errno(ret);
1025  goto out;
1026  }
1027 
1028  ret = ocfs2_find_path(ci, right_path, cpos);
1029  if (ret) {
1030  mlog_errno(ret);
1031  goto out;
1032  }
1033 
1034  subtree_root = ocfs2_find_subtree_root(&et, left_path,
1035  right_path);
1036 
1037  tmp_el = left_path->p_node[subtree_root].el;
1038  blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1039  for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
1040  if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1041  *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1042  break;
1043  }
1044  }
1045 
1046  BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
1047 
1048 out:
1049  ocfs2_free_path(left_path);
1050  ocfs2_free_path(right_path);
1051  return ret;
1052 }
1053 
1054 /*
1055  * Given a cpos and len, try to find the refcount record which contains cpos.
1056  * 1. If cpos can be found in one refcount record, return the record.
1057  * 2. If cpos can't be found, return a fake record which starts from cpos
1058  * and ends at a value between cpos+len and the start of the next record.
1059  * This fake record has r_refcount = 0.
1060  */
1061 static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1062  struct buffer_head *ref_root_bh,
1063  u64 cpos, unsigned int len,
1064  struct ocfs2_refcount_rec *ret_rec,
1065  int *index,
1066  struct buffer_head **ret_bh)
1067 {
1068  int ret = 0, i, found;
1069  u32 low_cpos, uninitialized_var(cpos_end);
1070  struct ocfs2_extent_list *el;
1071  struct ocfs2_extent_rec *rec = NULL;
1072  struct ocfs2_extent_block *eb = NULL;
1073  struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1074  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1075  struct ocfs2_refcount_block *rb =
1076  (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1077 
1078  if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1079  ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1080  ret_rec, index);
1081  *ret_bh = ref_root_bh;
1082  get_bh(ref_root_bh);
1083  return 0;
1084  }
1085 
1086  el = &rb->rf_list;
1087  low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1088 
1089  if (el->l_tree_depth) {
1090  ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1091  if (ret) {
1092  mlog_errno(ret);
1093  goto out;
1094  }
1095 
1096  eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1097  el = &eb->h_list;
1098 
1099  if (el->l_tree_depth) {
1100  ocfs2_error(sb,
1101  "refcount tree %llu has non zero tree "
1102  "depth in leaf btree tree block %llu\n",
1103  (unsigned long long)ocfs2_metadata_cache_owner(ci),
1104  (unsigned long long)eb_bh->b_blocknr);
1105  ret = -EROFS;
1106  goto out;
1107  }
1108  }
1109 
1110  found = 0;
1111  for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1112  rec = &el->l_recs[i];
1113 
1114  if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1115  found = 1;
1116  break;
1117  }
1118  }
1119 
1120  if (found) {
1121  ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1122  eb, el, i, &cpos_end);
1123  if (ret) {
1124  mlog_errno(ret);
1125  goto out;
1126  }
1127 
1128  if (cpos_end < low_cpos + len)
1129  len = cpos_end - low_cpos;
1130  }
1131 
1132  ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1133  &ref_leaf_bh);
1134  if (ret) {
1135  mlog_errno(ret);
1136  goto out;
1137  }
1138 
1139  ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1140  ret_rec, index);
1141  *ret_bh = ref_leaf_bh;
1142 out:
1143  brelse(eb_bh);
1144  return ret;
1145 }
1146 
1147 enum ocfs2_ref_rec_contig {
1148  REF_CONTIG_NONE = 0,
1149  REF_CONTIG_LEFT,
1150  REF_CONTIG_RIGHT,
1151  REF_CONTIG_LEFTRIGHT,
1152 };
1153 
1154 static enum ocfs2_ref_rec_contig
1155  ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1156  int index)
1157 {
1158  if ((rb->rf_records.rl_recs[index].r_refcount ==
1159  rb->rf_records.rl_recs[index + 1].r_refcount) &&
1160  (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1161  le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1162  le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1163  return REF_CONTIG_RIGHT;
1164 
1165  return REF_CONTIG_NONE;
1166 }
1167 
1168 static enum ocfs2_ref_rec_contig
1169  ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1170  int index)
1171 {
1172  enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1173 
1174  if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1175  ret = ocfs2_refcount_rec_adjacent(rb, index);
1176 
1177  if (index > 0) {
1178  enum ocfs2_ref_rec_contig tmp;
1179 
1180  tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1181 
1182  if (tmp == REF_CONTIG_RIGHT) {
1183  if (ret == REF_CONTIG_RIGHT)
1184  ret = REF_CONTIG_LEFTRIGHT;
1185  else
1186  ret = REF_CONTIG_LEFT;
1187  }
1188  }
1189 
1190  return ret;
1191 }
1192 
1193 static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1194  int index)
1195 {
1196  BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1197  rb->rf_records.rl_recs[index+1].r_refcount);
1198 
1199  le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1200  le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1201 
1202  if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1203  memmove(&rb->rf_records.rl_recs[index + 1],
1204  &rb->rf_records.rl_recs[index + 2],
1205  sizeof(struct ocfs2_refcount_rec) *
1206  (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1207 
1208  memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1209  0, sizeof(struct ocfs2_refcount_rec));
1210  le16_add_cpu(&rb->rf_records.rl_used, -1);
1211 }
1212 
1213 /*
1214  * Merge the refcount rec if we are contiguous with the adjacent recs.
1215  */
1216 static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1217  int index)
1218 {
1219  enum ocfs2_ref_rec_contig contig =
1220  ocfs2_refcount_rec_contig(rb, index);
1221 
1222  if (contig == REF_CONTIG_NONE)
1223  return;
1224 
1225  if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1226  BUG_ON(index == 0);
1227  index--;
1228  }
1229 
1230  ocfs2_rotate_refcount_rec_left(rb, index);
1231 
1232  if (contig == REF_CONTIG_LEFTRIGHT)
1233  ocfs2_rotate_refcount_rec_left(rb, index);
1234 }
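/*
 * [Editor's worked example, not part of the original file.]  With
 * records (0, 2, 1), (2, 3, 1), (5, 4, 1) and index = 1, both
 * neighbours are adjacent with equal refcounts, so the contig value
 * is REF_CONTIG_LEFTRIGHT: index moves back to 0 and two left
 * rotations fold all three into a single record (0, 9, 1), with
 * rl_used reduced by two.
 */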
1235 
1236 /*
1237  * Change the refcount indexed by "index" in ref_bh.
1238  * If refcount reaches 0, remove it.
1239  */
1240 static int ocfs2_change_refcount_rec(handle_t *handle,
1241  struct ocfs2_caching_info *ci,
1242  struct buffer_head *ref_leaf_bh,
1243  int index, int merge, int change)
1244 {
1245  int ret;
1246  struct ocfs2_refcount_block *rb =
1247  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1248  struct ocfs2_refcount_list *rl = &rb->rf_records;
1249  struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1250 
1251  ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1252  OCFS2_JOURNAL_ACCESS_WRITE);
1253  if (ret) {
1254  mlog_errno(ret);
1255  goto out;
1256  }
1257 
1258  trace_ocfs2_change_refcount_rec(
1259  (unsigned long long)ocfs2_metadata_cache_owner(ci),
1260  index, le32_to_cpu(rec->r_refcount), change);
1261  le32_add_cpu(&rec->r_refcount, change);
1262 
1263  if (!rec->r_refcount) {
1264  if (index != le16_to_cpu(rl->rl_used) - 1) {
1265  memmove(rec, rec + 1,
1266  (le16_to_cpu(rl->rl_used) - index - 1) *
1267  sizeof(struct ocfs2_refcount_rec));
1268  memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1269  0, sizeof(struct ocfs2_refcount_rec));
1270  }
1271 
1272  le16_add_cpu(&rl->rl_used, -1);
1273  } else if (merge)
1274  ocfs2_refcount_rec_merge(rb, index);
1275 
1276  ocfs2_journal_dirty(handle, ref_leaf_bh);
1277 out:
1278  return ret;
1279 }
1280 
1281 static int ocfs2_expand_inline_ref_root(handle_t *handle,
1282  struct ocfs2_caching_info *ci,
1283  struct buffer_head *ref_root_bh,
1284  struct buffer_head **ref_leaf_bh,
1285  struct ocfs2_alloc_context *meta_ac)
1286 {
1287  int ret;
1288  u16 suballoc_bit_start;
1289  u32 num_got;
1290  u64 suballoc_loc, blkno;
1291  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1292  struct buffer_head *new_bh = NULL;
1293  struct ocfs2_refcount_block *new_rb;
1294  struct ocfs2_refcount_block *root_rb =
1295  (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1296 
1297  ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1298  OCFS2_JOURNAL_ACCESS_WRITE);
1299  if (ret) {
1300  mlog_errno(ret);
1301  goto out;
1302  }
1303 
1304  ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1305  &suballoc_bit_start, &num_got,
1306  &blkno);
1307  if (ret) {
1308  mlog_errno(ret);
1309  goto out;
1310  }
1311 
1312  new_bh = sb_getblk(sb, blkno);
1313  if (new_bh == NULL) {
1314  ret = -EIO;
1315  mlog_errno(ret);
1316  goto out;
1317  }
1318  ocfs2_set_new_buffer_uptodate(ci, new_bh);
1319 
1320  ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1321  OCFS2_JOURNAL_ACCESS_CREATE);
1322  if (ret) {
1323  mlog_errno(ret);
1324  goto out;
1325  }
1326 
1327  /*
1328  * Initialize ocfs2_refcount_block.
1329  * It should contain the same information as the old root.
1330  * so just memcpy it and change the corresponding field.
1331  */
1332  memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1333 
1334  new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1335  new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1336  new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1337  new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1338  new_rb->rf_blkno = cpu_to_le64(blkno);
1339  new_rb->rf_cpos = cpu_to_le32(0);
1340  new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1341  new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1342  ocfs2_journal_dirty(handle, new_bh);
1343 
1344  /* Now change the root. */
1345  memset(&root_rb->rf_list, 0, sb->s_blocksize -
1346  offsetof(struct ocfs2_refcount_block, rf_list));
1347  root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1348  root_rb->rf_clusters = cpu_to_le32(1);
1349  root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1350  root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1351  root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1352  root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1353 
1354  ocfs2_journal_dirty(handle, ref_root_bh);
1355 
1356  trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1357  le16_to_cpu(new_rb->rf_records.rl_used));
1358 
1359  *ref_leaf_bh = new_bh;
1360  new_bh = NULL;
1361 out:
1362  brelse(new_bh);
1363  return ret;
1364 }
1365 
1366 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1367  struct ocfs2_refcount_rec *next)
1368 {
1369  if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1370  ocfs2_get_ref_rec_low_cpos(next))
1371  return 1;
1372 
1373  return 0;
1374 }
1375 
1376 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1377 {
1378  const struct ocfs2_refcount_rec *l = a, *r = b;
1379  u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1380  u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1381 
1382  if (l_cpos > r_cpos)
1383  return 1;
1384  if (l_cpos < r_cpos)
1385  return -1;
1386  return 0;
1387 }
1388 
1389 static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1390 {
1391  const struct ocfs2_refcount_rec *l = a, *r = b;
1392  u64 l_cpos = le64_to_cpu(l->r_cpos);
1393  u64 r_cpos = le64_to_cpu(r->r_cpos);
1394 
1395  if (l_cpos > r_cpos)
1396  return 1;
1397  if (l_cpos < r_cpos)
1398  return -1;
1399  return 0;
1400 }
1401 
1402 static void swap_refcount_rec(void *a, void *b, int size)
1403 {
1404  struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1405 
1406  tmp = *(struct ocfs2_refcount_rec *)l;
1407  *(struct ocfs2_refcount_rec *)l =
1408  *(struct ocfs2_refcount_rec *)r;
1409  *(struct ocfs2_refcount_rec *)r = tmp;
1410 }
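/*
 * [Editor's note, not part of the original file.]  The comparators
 * and swap helper above feed lib/sort; this file drives them as in
 * ocfs2_divide_leaf_refcount_block() below:
 *
 *	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
 *	     sizeof(struct ocfs2_refcount_rec),
 *	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
 */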
1411 
1412 /*
1413  * The refcount cpos are ordered by their 64bit cpos,
1414  * but we will use the low 32 bits as the e_cpos in the b-tree,
1415  * so we need to make sure that this pos doesn't intersect with others.
1416  *
1417  * Note: The refcount block is already sorted by its low 32 bit cpos,
1418  * so just try the middle pos first; we will exit when we find
1419  * a good position.
1420  */
1421 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1422  u32 *split_pos, int *split_index)
1423 {
1424  int num_used = le16_to_cpu(rl->rl_used);
1425  int delta, middle = num_used / 2;
1426 
1427  for (delta = 0; delta < middle; delta++) {
1428  /* Let's check delta earlier than middle */
1429  if (ocfs2_refcount_rec_no_intersect(
1430  &rl->rl_recs[middle - delta - 1],
1431  &rl->rl_recs[middle - delta])) {
1432  *split_index = middle - delta;
1433  break;
1434  }
1435 
1436  /* For even counts, don't walk off the end */
1437  if ((middle + delta + 1) == num_used)
1438  continue;
1439 
1440  /* Now try delta past middle */
1441  if (ocfs2_refcount_rec_no_intersect(
1442  &rl->rl_recs[middle + delta],
1443  &rl->rl_recs[middle + delta + 1])) {
1444  *split_index = middle + delta + 1;
1445  break;
1446  }
1447  }
1448 
1449  if (delta >= middle)
1450  return -ENOSPC;
1451 
1452  *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1453  return 0;
1454 }
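/*
 * [Editor's worked example, not part of the original file.]  With
 * rl_used = 6, middle = 3, the loop probes record boundaries fanning
 * out from the middle: 2|3, then 3|4, then 1|2, then 4|5, and stops
 * at the first boundary where the two records' low-cpos ranges don't
 * intersect.  Only if every probed boundary intersects does it return
 * -ENOSPC.
 */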
1455 
1456 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1457  struct buffer_head *new_bh,
1458  u32 *split_cpos)
1459 {
1460  int split_index = 0, num_moved, ret;
1461  u32 cpos = 0;
1462  struct ocfs2_refcount_block *rb =
1463  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1464  struct ocfs2_refcount_list *rl = &rb->rf_records;
1465  struct ocfs2_refcount_block *new_rb =
1466  (struct ocfs2_refcount_block *)new_bh->b_data;
1467  struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1468 
1469  trace_ocfs2_divide_leaf_refcount_block(
1470  (unsigned long long)ref_leaf_bh->b_blocknr,
1471  le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
1472 
1473  /*
1474  * XXX: Improvement later.
1475  * If we know all the high 32 bit cpos are the same, no need to sort.
1476  *
1477  * In order to make the whole process safe, we do:
1478  * 1. sort the entries by their low 32 bit cpos first so that we can
1479  * find the split cpos easily.
1480  * 2. call ocfs2_insert_extent to insert the new refcount block.
1481  * 3. move the refcount rec to the new block.
1482  * 4. sort the entries by their 64 bit cpos.
1483  * 5. dirty the new_rb and rb.
1484  */
1485  sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1486  sizeof(struct ocfs2_refcount_rec),
1487  cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1488 
1489  ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1490  if (ret) {
1491  mlog_errno(ret);
1492  return ret;
1493  }
1494 
1495  new_rb->rf_cpos = cpu_to_le32(cpos);
1496 
1497  /* move refcount records starting from split_index to the new block. */
1498  num_moved = le16_to_cpu(rl->rl_used) - split_index;
1499  memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1500  num_moved * sizeof(struct ocfs2_refcount_rec));
1501 
1502  /* OK, remove the entries we just moved over to the other block. */
1503  memset(&rl->rl_recs[split_index], 0,
1504  num_moved * sizeof(struct ocfs2_refcount_rec));
1505 
1506  /* change old and new rl_used accordingly. */
1507  le16_add_cpu(&rl->rl_used, -num_moved);
1508  new_rl->rl_used = cpu_to_le16(num_moved);
1509 
1510  sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1511  sizeof(struct ocfs2_refcount_rec),
1512  cmp_refcount_rec_by_cpos, swap_refcount_rec);
1513 
1514  sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1515  sizeof(struct ocfs2_refcount_rec),
1516  cmp_refcount_rec_by_cpos, swap_refcount_rec);
1517 
1518  *split_cpos = cpos;
1519  return 0;
1520 }
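/*
 * [Editor's worked example, not part of the original file.]  If the
 * full leaf holds six records and the split index lands at 4, the
 * last two records are memcpy'd into the new block (num_moved = 2)
 * and zeroed out of the old one; rl_used becomes 4 and 2
 * respectively, and both lists are re-sorted by full 64-bit cpos
 * before the caller dirties them.
 */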
1521 
1522 static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1523  struct ocfs2_caching_info *ci,
1524  struct buffer_head *ref_root_bh,
1525  struct buffer_head *ref_leaf_bh,
1526  struct ocfs2_alloc_context *meta_ac)
1527 {
1528  int ret;
1529  u16 suballoc_bit_start;
1530  u32 num_got, new_cpos;
1531  u64 suballoc_loc, blkno;
1532  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1533  struct ocfs2_refcount_block *root_rb =
1534  (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1535  struct buffer_head *new_bh = NULL;
1536  struct ocfs2_refcount_block *new_rb;
1537  struct ocfs2_extent_tree ref_et;
1538 
1539  BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1540 
1541  ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1542  OCFS2_JOURNAL_ACCESS_WRITE);
1543  if (ret) {
1544  mlog_errno(ret);
1545  goto out;
1546  }
1547 
1548  ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1549  OCFS2_JOURNAL_ACCESS_WRITE);
1550  if (ret) {
1551  mlog_errno(ret);
1552  goto out;
1553  }
1554 
1555  ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1556  &suballoc_bit_start, &num_got,
1557  &blkno);
1558  if (ret) {
1559  mlog_errno(ret);
1560  goto out;
1561  }
1562 
1563  new_bh = sb_getblk(sb, blkno);
1564  if (new_bh == NULL) {
1565  ret = -EIO;
1566  mlog_errno(ret);
1567  goto out;
1568  }
1569  ocfs2_set_new_buffer_uptodate(ci, new_bh);
1570 
1571  ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1572  OCFS2_JOURNAL_ACCESS_CREATE);
1573  if (ret) {
1574  mlog_errno(ret);
1575  goto out;
1576  }
1577 
1578  /* Initialize ocfs2_refcount_block. */
1579  new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1580  memset(new_rb, 0, sb->s_blocksize);
1581  strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1582  new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1583  new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1584  new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1585  new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1586  new_rb->rf_blkno = cpu_to_le64(blkno);
1587  new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1588  new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1589  new_rb->rf_records.rl_count =
1590  cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1591  new_rb->rf_generation = root_rb->rf_generation;
1592 
1593  ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1594  if (ret) {
1595  mlog_errno(ret);
1596  goto out;
1597  }
1598 
1599  ocfs2_journal_dirty(handle, ref_leaf_bh);
1600  ocfs2_journal_dirty(handle, new_bh);
1601 
1602  ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1603 
1604  trace_ocfs2_new_leaf_refcount_block(
1605  (unsigned long long)new_bh->b_blocknr, new_cpos);
1606 
1607  /* Insert the new leaf block with the specific offset cpos. */
1608  ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1609  1, 0, meta_ac);
1610  if (ret)
1611  mlog_errno(ret);
1612 
1613 out:
1614  brelse(new_bh);
1615  return ret;
1616 }
1617 
1618 static int ocfs2_expand_refcount_tree(handle_t *handle,
1619  struct ocfs2_caching_info *ci,
1620  struct buffer_head *ref_root_bh,
1621  struct buffer_head *ref_leaf_bh,
1622  struct ocfs2_alloc_context *meta_ac)
1623 {
1624  int ret;
1625  struct buffer_head *expand_bh = NULL;
1626 
1627  if (ref_root_bh == ref_leaf_bh) {
1628  /*
1629  * the old root bh hasn't been expanded to a b-tree,
1630  * so expand it first.
1631  */
1632  ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1633  &expand_bh, meta_ac);
1634  if (ret) {
1635  mlog_errno(ret);
1636  goto out;
1637  }
1638  } else {
1639  expand_bh = ref_leaf_bh;
1640  get_bh(expand_bh);
1641  }
1642 
1643 
1644  /* Now add a new refcount block into the tree.*/
1645  ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1646  expand_bh, meta_ac);
1647  if (ret)
1648  mlog_errno(ret);
1649 out:
1650  brelse(expand_bh);
1651  return ret;
1652 }
1653 
1654 /*
1655  * Adjust the extent rec in b-tree representing ref_leaf_bh.
1656  *
1657  * Only called when we have inserted a new refcount rec at index 0
1658  * which means ocfs2_extent_rec.e_cpos may need some change.
1659  */
1660 static int ocfs2_adjust_refcount_rec(handle_t *handle,
1661  struct ocfs2_caching_info *ci,
1662  struct buffer_head *ref_root_bh,
1663  struct buffer_head *ref_leaf_bh,
1664  struct ocfs2_refcount_rec *rec)
1665 {
1666  int ret = 0, i;
1667  u32 new_cpos, old_cpos;
1668  struct ocfs2_path *path = NULL;
1669  struct ocfs2_extent_tree et;
1670  struct ocfs2_refcount_block *rb =
1671  (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1672  struct ocfs2_extent_list *el;
1673 
1674  if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1675  goto out;
1676 
1677  rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1678  old_cpos = le32_to_cpu(rb->rf_cpos);
1679  new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1680  if (old_cpos <= new_cpos)
1681  goto out;
1682 
1683  ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1684 
1685  path = ocfs2_new_path_from_et(&et);
1686  if (!path) {
1687  ret = -ENOMEM;
1688  mlog_errno(ret);
1689  goto out;
1690  }
1691 
1692  ret = ocfs2_find_path(ci, path, old_cpos);
1693  if (ret) {
1694  mlog_errno(ret);
1695  goto out;
1696  }
1697 
1698  /*
1699  * 2 more credits, one for the leaf refcount block, one for
1700  * the extent block contains the extent rec.
1701  */
1702  ret = ocfs2_extend_trans(handle, 2);
1703  if (ret < 0) {
1704  mlog_errno(ret);
1705  goto out;
1706  }
1707 
1708  ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1709  OCFS2_JOURNAL_ACCESS_WRITE);
1710  if (ret < 0) {
1711  mlog_errno(ret);
1712  goto out;
1713  }
1714 
1715  ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1716  OCFS2_JOURNAL_ACCESS_WRITE);
1717  if (ret < 0) {
1718  mlog_errno(ret);
1719  goto out;
1720  }
1721 
1722  /* change the leaf extent block first. */
1723  el = path_leaf_el(path);
1724 
1725  for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1726  if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1727  break;
1728 
1729  BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1730 
1731  el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1732 
1733  /* change the r_cpos in the leaf block. */
1734  rb->rf_cpos = cpu_to_le32(new_cpos);
1735 
1736  ocfs2_journal_dirty(handle, path_leaf_bh(path));
1737  ocfs2_journal_dirty(handle, ref_leaf_bh);
1738 
1739 out:
1740  ocfs2_free_path(path);
1741  return ret;
1742 }
1743 
1744 static int ocfs2_insert_refcount_rec(handle_t *handle,
1745  struct ocfs2_caching_info *ci,
1746  struct buffer_head *ref_root_bh,
1747  struct buffer_head *ref_leaf_bh,
1748  struct ocfs2_refcount_rec *rec,
1749  int index, int merge,
1750  struct ocfs2_alloc_context *meta_ac)
1751 {
1752  int ret;
1753  struct ocfs2_refcount_block *rb =
1754  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1755  struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1756  struct buffer_head *new_bh = NULL;
1757 
1758  BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1759 
1760  if (rf_list->rl_used == rf_list->rl_count) {
1761  u64 cpos = le64_to_cpu(rec->r_cpos);
1762  u32 len = le32_to_cpu(rec->r_clusters);
1763 
1764  ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1765  ref_leaf_bh, meta_ac);
1766  if (ret) {
1767  mlog_errno(ret);
1768  goto out;
1769  }
1770 
1771  ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1772  cpos, len, NULL, &index,
1773  &new_bh);
1774  if (ret) {
1775  mlog_errno(ret);
1776  goto out;
1777  }
1778 
1779  ref_leaf_bh = new_bh;
1780  rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1781  rf_list = &rb->rf_records;
1782  }
1783 
1784  ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1785  OCFS2_JOURNAL_ACCESS_WRITE);
1786  if (ret) {
1787  mlog_errno(ret);
1788  goto out;
1789  }
1790 
1791  if (index < le16_to_cpu(rf_list->rl_used))
1792  memmove(&rf_list->rl_recs[index + 1],
1793  &rf_list->rl_recs[index],
1794  (le16_to_cpu(rf_list->rl_used) - index) *
1795  sizeof(struct ocfs2_refcount_rec));
1796 
1797  trace_ocfs2_insert_refcount_rec(
1798  (unsigned long long)ref_leaf_bh->b_blocknr, index,
1799  (unsigned long long)le64_to_cpu(rec->r_cpos),
1800  le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1801 
1802  rf_list->rl_recs[index] = *rec;
1803 
1804  le16_add_cpu(&rf_list->rl_used, 1);
1805 
1806  if (merge)
1807  ocfs2_refcount_rec_merge(rb, index);
1808 
1809  ocfs2_journal_dirty(handle, ref_leaf_bh);
1810 
1811  if (index == 0) {
1812  ret = ocfs2_adjust_refcount_rec(handle, ci,
1813  ref_root_bh,
1814  ref_leaf_bh, rec);
1815  if (ret)
1816  mlog_errno(ret);
1817  }
1818 out:
1819  brelse(new_bh);
1820  return ret;
1821 }
1822 
1823 /*
1824  * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1825  * This is much simpler than our b-tree code.
1826  * split_rec is the new refcount rec we want to insert.
1827  * If split_rec->r_refcount > 0, we are changing the refcount (in case we
1828  * increase a refcount or decrease a refcount to non-zero).
1829  * If split_rec->r_refcount == 0, we are punching a hole in the current
1830  * refcount rec (in case we decrease a refcount to zero).
1831  */
1832 static int ocfs2_split_refcount_rec(handle_t *handle,
1833  struct ocfs2_caching_info *ci,
1834  struct buffer_head *ref_root_bh,
1835  struct buffer_head *ref_leaf_bh,
1836  struct ocfs2_refcount_rec *split_rec,
1837  int index, int merge,
1838  struct ocfs2_alloc_context *meta_ac,
1839  struct ocfs2_cached_dealloc_ctxt *dealloc)
1840 {
1841  int ret, recs_need;
1842  u32 len;
1843  struct ocfs2_refcount_block *rb =
1844  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1845  struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1846  struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1847  struct ocfs2_refcount_rec *tail_rec = NULL;
1848  struct buffer_head *new_bh = NULL;
1849 
1850  BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1851 
1852  trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
1853  le32_to_cpu(orig_rec->r_clusters),
1854  le32_to_cpu(orig_rec->r_refcount),
1855  le64_to_cpu(split_rec->r_cpos),
1856  le32_to_cpu(split_rec->r_clusters),
1857  le32_to_cpu(split_rec->r_refcount));
1858 
1859  /*
1860  * If we just need to split the header or tail clusters,
1861  * no more recs are needed; just splitting is OK.
1862  * Otherwise we need at least one new rec.
1863  */
1864  if (!split_rec->r_refcount &&
1865  (split_rec->r_cpos == orig_rec->r_cpos ||
1866  le64_to_cpu(split_rec->r_cpos) +
1867  le32_to_cpu(split_rec->r_clusters) ==
1868  le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1869  recs_need = 0;
1870  else
1871  recs_need = 1;
1872 
1873  /*
1874  * We need one more rec if we split in the middle and the new rec has
1875  * some refcount in it.
1876  */
1877  if (split_rec->r_refcount &&
1878  (split_rec->r_cpos != orig_rec->r_cpos &&
1879  le64_to_cpu(split_rec->r_cpos) +
1880  le32_to_cpu(split_rec->r_clusters) !=
1881  le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1882  recs_need++;
1883 
1884  /* If the leaf block doesn't have enough records, expand it. */
1885  if (le16_to_cpu(rf_list->rl_used) + recs_need >
1886  le16_to_cpu(rf_list->rl_count)) {
1887  struct ocfs2_refcount_rec tmp_rec;
1888  u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1889  len = le32_to_cpu(orig_rec->r_clusters);
1890  ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1891  ref_leaf_bh, meta_ac);
1892  if (ret) {
1893  mlog_errno(ret);
1894  goto out;
1895  }
1896 
1897  /*
1898  * We have to re-get it since now cpos may be moved to
1899  * another leaf block.
1900  */
1901  ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1902  cpos, len, &tmp_rec, &index,
1903  &new_bh);
1904  if (ret) {
1905  mlog_errno(ret);
1906  goto out;
1907  }
1908 
1909  ref_leaf_bh = new_bh;
1910  rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1911  rf_list = &rb->rf_records;
1912  orig_rec = &rf_list->rl_recs[index];
1913  }
1914 
1915  ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1916  OCFS2_JOURNAL_ACCESS_WRITE);
1917  if (ret) {
1918  mlog_errno(ret);
1919  goto out;
1920  }
1921 
1922  /*
1923  * We have calculated how many new records we need and stored
1924  * the count in recs_need, so spare enough space first by moving the records
1925  * after "index" to the end.
1926  */
1927  if (index != le16_to_cpu(rf_list->rl_used) - 1)
1928  memmove(&rf_list->rl_recs[index + 1 + recs_need],
1929  &rf_list->rl_recs[index + 1],
1930  (le16_to_cpu(rf_list->rl_used) - index - 1) *
1931  sizeof(struct ocfs2_refcount_rec));
1932 
1933  len = (le64_to_cpu(orig_rec->r_cpos) +
1934  le32_to_cpu(orig_rec->r_clusters)) -
1935  (le64_to_cpu(split_rec->r_cpos) +
1936  le32_to_cpu(split_rec->r_clusters));
1937 
1938  /*
1939  * If we have "len", then we will split in the tail and move it
1940  * to the end of the space we have just spared.
1941  */
1942  if (len) {
1943  tail_rec = &rf_list->rl_recs[index + recs_need];
1944 
1945  memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1946  le64_add_cpu(&tail_rec->r_cpos,
1947  le32_to_cpu(tail_rec->r_clusters) - len);
1948  tail_rec->r_clusters = cpu_to_le32(len);
1949  }
1950 
1951  /*
1952  * If the split pos isn't the same as the original one, we need to
1953  * split in the head.
1954  *
1955  * Note: It may happen that split_rec.r_refcount == 0,
1956  * recs_need = 0 and len > 0, which means we just cut the head from
1957  * the orig_rec and in that case we have done some modification in
1958  * orig_rec above, so the check for r_cpos is faked.
1959  */
1960  if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1961  len = le64_to_cpu(split_rec->r_cpos) -
1962  le64_to_cpu(orig_rec->r_cpos);
1963  orig_rec->r_clusters = cpu_to_le32(len);
1964  index++;
1965  }
1966 
1967  le16_add_cpu(&rf_list->rl_used, recs_need);
1968 
1969  if (split_rec->r_refcount) {
1970  rf_list->rl_recs[index] = *split_rec;
1971  trace_ocfs2_split_refcount_rec_insert(
1972  (unsigned long long)ref_leaf_bh->b_blocknr, index,
1973  (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1974  le32_to_cpu(split_rec->r_clusters),
1975  le32_to_cpu(split_rec->r_refcount));
1976 
1977  if (merge)
1978  ocfs2_refcount_rec_merge(rb, index);
1979  }
1980 
1981  ocfs2_journal_dirty(handle, ref_leaf_bh);
1982 
1983 out:
1984  brelse(new_bh);
1985  return ret;
1986 }
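/*
 * [Editor's worked example, not part of the original file.]  Splitting
 * orig_rec (r_cpos 0, 10 clusters, refcount 1) with split_rec
 * (r_cpos 3, 4 clusters, refcount 2) gives recs_need = 2: the tail
 * (7, 3, 1) is copied past the spared gap, the head is truncated to
 * (0, 3, 1), and (3, 4, 2) is written at the incremented index -
 * three records where there was one.
 */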
1987 
1988 static int __ocfs2_increase_refcount(handle_t *handle,
1989  struct ocfs2_caching_info *ci,
1990  struct buffer_head *ref_root_bh,
1991  u64 cpos, u32 len, int merge,
1992  struct ocfs2_alloc_context *meta_ac,
1993  struct ocfs2_cached_dealloc_ctxt *dealloc)
1994 {
1995  int ret = 0, index;
1996  struct buffer_head *ref_leaf_bh = NULL;
1997  struct ocfs2_refcount_rec rec;
1998  unsigned int set_len = 0;
1999 
2000  trace_ocfs2_increase_refcount_begin(
2001  (unsigned long long)ocfs2_metadata_cache_owner(ci),
2002  (unsigned long long)cpos, len);
2003 
2004  while (len) {
2005  ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2006  cpos, len, &rec, &index,
2007  &ref_leaf_bh);
2008  if (ret) {
2009  mlog_errno(ret);
2010  goto out;
2011  }
2012 
2013  set_len = le32_to_cpu(rec.r_clusters);
2014 
2015  /*
2016  * Here we may meet 3 situations:
2017  *
2018  * 1. If we find an already existing record and the length
2019  * is the same, we just need to increase the r_refcount
2020  * and we are done.
2021  * 2. If we find a hole, just insert it with r_refcount = 1.
2022  * 3. If we are in the middle of one extent record, split
2023  * it.
2024  */
2025  if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2026  set_len <= len) {
2027  trace_ocfs2_increase_refcount_change(
2028  (unsigned long long)cpos, set_len,
2029  le32_to_cpu(rec.r_refcount));
2030  ret = ocfs2_change_refcount_rec(handle, ci,
2031  ref_leaf_bh, index,
2032  merge, 1);
2033  if (ret) {
2034  mlog_errno(ret);
2035  goto out;
2036  }
2037  } else if (!rec.r_refcount) {
2038  rec.r_refcount = cpu_to_le32(1);
2039 
2040  trace_ocfs2_increase_refcount_insert(
2041  (unsigned long long)le64_to_cpu(rec.r_cpos),
2042  set_len);
2043  ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2044  ref_leaf_bh,
2045  &rec, index,
2046  merge, meta_ac);
2047  if (ret) {
2048  mlog_errno(ret);
2049  goto out;
2050  }
2051  } else {
2052  set_len = min((u64)(cpos + len),
2053  le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2054  rec.r_cpos = cpu_to_le64(cpos);
2055  rec.r_clusters = cpu_to_le32(set_len);
2056  le32_add_cpu(&rec.r_refcount, 1);
2057 
2058  trace_ocfs2_increase_refcount_split(
2059  (unsigned long long)le64_to_cpu(rec.r_cpos),
2060  set_len, le32_to_cpu(rec.r_refcount));
2061  ret = ocfs2_split_refcount_rec(handle, ci,
2062  ref_root_bh, ref_leaf_bh,
2063  &rec, index, merge,
2064  meta_ac, dealloc);
2065  if (ret) {
2066  mlog_errno(ret);
2067  goto out;
2068  }
2069  }
2070 
2071  cpos += set_len;
2072  len -= set_len;
2073  brelse(ref_leaf_bh);
2074  ref_leaf_bh = NULL;
2075  }
2076 
2077 out:
2078  brelse(ref_leaf_bh);
2079  return ret;
2080 }
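
Editorial note: the loop above resolves one refcount record per iteration and applies the three cases listed in the comment. Below is a minimal user-space sketch of the same per-record decision; struct rec and increase_step() are hypothetical host-endian simplifications of struct ocfs2_refcount_rec and the kernel loop, not kernel API.

#include <stdint.h>
#include <stdio.h>

struct rec {
	uint64_t cpos;		/* first cluster covered by the record */
	uint32_t clusters;	/* length in clusters */
	uint32_t refcount;	/* 0 means a hole */
};

/* One iteration of the loop for the range [cpos, cpos + len);
 * returns how many clusters were consumed. */
static uint32_t increase_step(struct rec *r, uint64_t cpos, uint32_t len)
{
	uint32_t set_len = r->clusters;

	if (r->refcount && r->cpos == cpos && set_len <= len) {
		r->refcount++;		/* case 1: record matches, just bump */
	} else if (!r->refcount) {
		r->refcount = 1;	/* case 2: hole, insert with refcount 1 */
	} else {
		/* case 3: we are in the middle of the record, so split:
		 * clamp to the overlap of the range and the record. */
		uint64_t rec_end = r->cpos + r->clusters;
		uint64_t end = (cpos + len < rec_end) ? cpos + len : rec_end;

		set_len = (uint32_t)(end - cpos);
		r->cpos = cpos;
		r->clusters = set_len;
		r->refcount++;
	}
	return set_len;
}

int main(void)
{
	struct rec r = { 0, 100, 1 };	/* refcounted extent [0, 100) */

	/* Splits out [40, 60) and bumps only that part to refcount 2. */
	uint32_t used = increase_step(&r, 40, 20);

	printf("consumed %u; rec now [%llu, +%u) refcount %u\n",
	       (unsigned)used, (unsigned long long)r.cpos,
	       (unsigned)r.clusters, (unsigned)r.refcount);
	return 0;
}
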
2081 
2082 static int ocfs2_remove_refcount_extent(handle_t *handle,
2083  struct ocfs2_caching_info *ci,
2084  struct buffer_head *ref_root_bh,
2085  struct buffer_head *ref_leaf_bh,
2086  struct ocfs2_alloc_context *meta_ac,
2087  struct ocfs2_cached_dealloc_ctxt *dealloc)
2088 {
2089  int ret;
2090  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2091  struct ocfs2_refcount_block *rb =
2092  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2093  struct ocfs2_extent_tree et;
2094 
2095  BUG_ON(rb->rf_records.rl_used);
2096 
2097  trace_ocfs2_remove_refcount_extent(
2098  (unsigned long long)ocfs2_metadata_cache_owner(ci),
2099  (unsigned long long)ref_leaf_bh->b_blocknr,
2100  le32_to_cpu(rb->rf_cpos));
2101 
2102  ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2103  ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2104  1, meta_ac, dealloc);
2105  if (ret) {
2106  mlog_errno(ret);
2107  goto out;
2108  }
2109 
2110  ocfs2_remove_from_cache(ci, ref_leaf_bh);
2111 
2112  /*
2113  * Add the freed block to the dealloc context so that it will
2114  * be freed when we run the deallocs.
2115  */
2116  ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM,
2117  le16_to_cpu(rb->rf_suballoc_slot),
2118  le64_to_cpu(rb->rf_suballoc_loc),
2119  le64_to_cpu(rb->rf_blkno),
2120  le16_to_cpu(rb->rf_suballoc_bit));
2121  if (ret) {
2122  mlog_errno(ret);
2123  goto out;
2124  }
2125 
2126  ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2127  OCFS2_JOURNAL_ACCESS_WRITE);
2128  if (ret) {
2129  mlog_errno(ret);
2130  goto out;
2131  }
2132 
2133  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2134 
2135  le32_add_cpu(&rb->rf_clusters, -1);
2136 
2137  /*
2138  * Check whether we need to restore the root refcount block if
2139  * there is no leaf extent block at all.
2140  */
2141  if (!rb->rf_list.l_next_free_rec) {
2142  BUG_ON(rb->rf_clusters);
2143 
2144  trace_ocfs2_restore_refcount_block(
2145  (unsigned long long)ref_root_bh->b_blocknr);
2146 
2147  rb->rf_flags = 0;
2148  rb->rf_parent = 0;
2149  rb->rf_cpos = 0;
2150  memset(&rb->rf_records, 0, sb->s_blocksize -
2151  offsetof(struct ocfs2_refcount_block, rf_records));
2152  rb->rf_records.rl_count =
2153  cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2154  }
2155 
2156  ocfs2_journal_dirty(handle, ref_root_bh);
2157 
2158 out:
2159  return ret;
2160 }
2161 
2162 int ocfs2_increase_refcount(handle_t *handle,
2163  struct ocfs2_caching_info *ci,
2164  struct buffer_head *ref_root_bh,
2165  u64 cpos, u32 len,
2166  struct ocfs2_alloc_context *meta_ac,
2167  struct ocfs2_cached_dealloc_ctxt *dealloc)
2168 {
2169  return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2170  cpos, len, 1,
2171  meta_ac, dealloc);
2172 }
2173 
2174 static int ocfs2_decrease_refcount_rec(handle_t *handle,
2175  struct ocfs2_caching_info *ci,
2176  struct buffer_head *ref_root_bh,
2177  struct buffer_head *ref_leaf_bh,
2178  int index, u64 cpos, unsigned int len,
2179  struct ocfs2_alloc_context *meta_ac,
2180  struct ocfs2_cached_dealloc_ctxt *dealloc)
2181 {
2182  int ret;
2183  struct ocfs2_refcount_block *rb =
2184  (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2185  struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2186 
2187  BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2188  BUG_ON(cpos + len >
2189  le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2190 
2191  trace_ocfs2_decrease_refcount_rec(
2192  (unsigned long long)ocfs2_metadata_cache_owner(ci),
2193  (unsigned long long)cpos, len);
2194 
2195  if (cpos == le64_to_cpu(rec->r_cpos) &&
2196  len == le32_to_cpu(rec->r_clusters))
2197  ret = ocfs2_change_refcount_rec(handle, ci,
2198  ref_leaf_bh, index, 1, -1);
2199  else {
2200  struct ocfs2_refcount_rec split = *rec;
2201  split.r_cpos = cpu_to_le64(cpos);
2202  split.r_clusters = cpu_to_le32(len);
2203 
2204  le32_add_cpu(&split.r_refcount, -1);
2205 
2206  ret = ocfs2_split_refcount_rec(handle, ci,
2207  ref_root_bh, ref_leaf_bh,
2208  &split, index, 1,
2209  meta_ac, dealloc);
2210  }
2211 
2212  if (ret) {
2213  mlog_errno(ret);
2214  goto out;
2215  }
2216 
2217  /* Remove the leaf refcount block if it contains no refcount record. */
2218  if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2219  ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2220  ref_leaf_bh, meta_ac,
2221  dealloc);
2222  if (ret)
2223  mlog_errno(ret);
2224  }
2225 
2226 out:
2227  return ret;
2228 }
2229 
2230 static int __ocfs2_decrease_refcount(handle_t *handle,
2231  struct ocfs2_caching_info *ci,
2232  struct buffer_head *ref_root_bh,
2233  u64 cpos, u32 len,
2234  struct ocfs2_alloc_context *meta_ac,
2235  struct ocfs2_cached_dealloc_ctxt *dealloc,
2236  int delete)
2237 {
2238  int ret = 0, index = 0;
2239  struct ocfs2_refcount_rec rec;
2240  unsigned int r_count = 0, r_len;
2241  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2242  struct buffer_head *ref_leaf_bh = NULL;
2243 
2244  trace_ocfs2_decrease_refcount(
2245  (unsigned long long)ocfs2_metadata_cache_owner(ci),
2246  (unsigned long long)cpos, len, delete);
2247 
2248  while (len) {
2249  ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2250  cpos, len, &rec, &index,
2251  &ref_leaf_bh);
2252  if (ret) {
2253  mlog_errno(ret);
2254  goto out;
2255  }
2256 
2257  r_count = le32_to_cpu(rec.r_refcount);
2258  BUG_ON(r_count == 0);
2259  if (!delete)
2260  BUG_ON(r_count > 1);
2261 
2262  r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2263  le32_to_cpu(rec.r_clusters)) - cpos;
2264 
2265  ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2266  ref_leaf_bh, index,
2267  cpos, r_len,
2268  meta_ac, dealloc);
2269  if (ret) {
2270  mlog_errno(ret);
2271  goto out;
2272  }
2273 
2274  if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2275  ret = ocfs2_cache_cluster_dealloc(dealloc,
2276  ocfs2_clusters_to_blocks(sb, cpos),
2277  r_len);
2278  if (ret) {
2279  mlog_errno(ret);
2280  goto out;
2281  }
2282  }
2283 
2284  cpos += r_len;
2285  len -= r_len;
2286  brelse(ref_leaf_bh);
2287  ref_leaf_bh = NULL;
2288  }
2289 
2290 out:
2291  brelse(ref_leaf_bh);
2292  return ret;
2293 }
2294 
2295 /* Caller must hold refcount tree lock. */
2296 int ocfs2_decrease_refcount(struct inode *inode,
2297  handle_t *handle, u32 cpos, u32 len,
2298  struct ocfs2_alloc_context *meta_ac,
2299  struct ocfs2_cached_dealloc_ctxt *dealloc,
2300  int delete)
2301 {
2302  int ret;
2303  u64 ref_blkno;
2304  struct ocfs2_inode_info *oi = OCFS2_I(inode);
2305  struct buffer_head *ref_root_bh = NULL;
2306  struct ocfs2_refcount_tree *tree;
2307 
2308  BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2309 
2310  ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2311  if (ret) {
2312  mlog_errno(ret);
2313  goto out;
2314  }
2315 
2316  ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2317  if (ret) {
2318  mlog_errno(ret);
2319  goto out;
2320  }
2321 
2322  ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2323  &ref_root_bh);
2324  if (ret) {
2325  mlog_errno(ret);
2326  goto out;
2327  }
2328 
2329  ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2330  cpos, len, meta_ac, dealloc, delete);
2331  if (ret)
2332  mlog_errno(ret);
2333 out:
2334  brelse(ref_root_bh);
2335  return ret;
2336 }
2337 
2338 /*
2339  * Mark the already-existing extent at cpos as refcounted for len clusters.
2340  * This adds the refcount extent flag.
2341  *
2342  * If the existing extent is larger than the request, initiate a
2343  * split. An attempt will be made at merging with adjacent extents.
2344  *
2345  * The caller is responsible for passing down meta_ac if we'll need it.
2346  */
2347 static int ocfs2_mark_extent_refcounted(struct inode *inode,
2348  struct ocfs2_extent_tree *et,
2349  handle_t *handle, u32 cpos,
2350  u32 len, u32 phys,
2351  struct ocfs2_alloc_context *meta_ac,
2352  struct ocfs2_cached_dealloc_ctxt *dealloc)
2353 {
2354  int ret;
2355 
2356  trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2357  cpos, len, phys);
2358 
2359  if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2360  ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
2361  "tree, but the feature bit is not set in the "
2362  "super block.", inode->i_ino);
2363  ret = -EROFS;
2364  goto out;
2365  }
2366 
2367  ret = ocfs2_change_extent_flag(handle, et, cpos,
2368  len, phys, meta_ac, dealloc,
2370  if (ret)
2371  mlog_errno(ret);
2372 
2373 out:
2374  return ret;
2375 }
2376 
2377 /*
2378  * Given some contiguous physical clusters, calculate what we need
2379  * for modifying their refcount.
2380  */
2381 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2382  struct ocfs2_caching_info *ci,
2383  struct buffer_head *ref_root_bh,
2384  u64 start_cpos,
2385  u32 clusters,
2386  int *meta_add,
2387  int *credits)
2388 {
2389  int ret = 0, index, ref_blocks = 0, recs_add = 0;
2390  u64 cpos = start_cpos;
2391  struct ocfs2_refcount_block *rb;
2392  struct ocfs2_refcount_rec rec;
2393  struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2394  u32 len;
2395 
2396  while (clusters) {
2397  ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2398  cpos, clusters, &rec,
2399  &index, &ref_leaf_bh);
2400  if (ret) {
2401  mlog_errno(ret);
2402  goto out;
2403  }
2404 
2405  if (ref_leaf_bh != prev_bh) {
2406  /*
2407  * Now we encounter a new leaf block, so calculate
2408  * whether we need to extend the old leaf.
2409  */
2410  if (prev_bh) {
2411  rb = (struct ocfs2_refcount_block *)
2412  prev_bh->b_data;
2413 
2414  if (le16_to_cpu(rb->rf_records.rl_used) +
2415  recs_add >
2416  le16_to_cpu(rb->rf_records.rl_count))
2417  ref_blocks++;
2418  }
2419 
2420  recs_add = 0;
2421  *credits += 1;
2422  brelse(prev_bh);
2423  prev_bh = ref_leaf_bh;
2424  get_bh(prev_bh);
2425  }
2426 
2427  rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2428 
2429  trace_ocfs2_calc_refcount_meta_credits_iterate(
2430  recs_add, (unsigned long long)cpos, clusters,
2431  (unsigned long long)le64_to_cpu(rec.r_cpos),
2432  le32_to_cpu(rec.r_clusters),
2433  le32_to_cpu(rec.r_refcount), index);
2434 
2435  len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2436  le32_to_cpu(rec.r_clusters)) - cpos;
2437  /*
2438  * We record all the records which will be inserted to the
2439  * same refcount block, so that we can tell exactly whether
2440  * we need a new refcount block or not.
2441  *
2442  * If we insert a new one, this is easy and only happens
2443  * while adding the refcounted flag to the extent, so we don't
2444  * have a chance of splitting. We just need one record.
2445  *
2446  * If the refcount rec already exists, things are a little more
2447  * complicated. We may have to:
2448  * 1) split at the beginning if the start pos isn't aligned;
2449  * we need 1 more record in this case.
2450  * 2) split at the end if the end pos isn't aligned;
2451  * we need 1 more record in this case.
2452  * 3) split in the middle because of file system fragmentation;
2453  * we need 2 more records in this case (we can't detect this
2454  * beforehand, so always assume the worst case).
2455  */
2456  if (rec.r_refcount) {
2457  recs_add += 2;
2458  /* Check whether we need a split at the beginning. */
2459  if (cpos == start_cpos &&
2460  cpos != le64_to_cpu(rec.r_cpos))
2461  recs_add++;
2462 
2463  /* Check whether we need a split in the end. */
2464  if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2465  le32_to_cpu(rec.r_clusters))
2466  recs_add++;
2467  } else
2468  recs_add++;
2469 
2470  brelse(ref_leaf_bh);
2471  ref_leaf_bh = NULL;
2472  clusters -= len;
2473  cpos += len;
2474  }
2475 
2476  if (prev_bh) {
2477  rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2478 
2479  if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
2480  le16_to_cpu(rb->rf_records.rl_count))
2481  ref_blocks++;
2482 
2483  *credits += 1;
2484  }
2485 
2486  if (!ref_blocks)
2487  goto out;
2488 
2489  *meta_add += ref_blocks;
2490  *credits += ref_blocks;
2491 
2492  /*
2493  * So we may need ref_blocks to insert into the tree.
2494  * That also means we need to change the b-tree and add that number
2495  * of records since we never merge them.
2496  * We need one more block for expansion since the newly created
2497  * leaf block is also full and needs a split.
2498  */
2499  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2500  if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2501  struct ocfs2_extent_tree et;
2502 
2503  ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2504  *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2505  *credits += ocfs2_calc_extend_credits(sb,
2506  et.et_root_el,
2507  ref_blocks);
2508  } else {
2509  *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2510  *meta_add += 1;
2511  }
2512 
2513 out:
2514 
2515  trace_ocfs2_calc_refcount_meta_credits(
2516  (unsigned long long)start_cpos, clusters,
2517  *meta_add, *credits);
2518  brelse(ref_leaf_bh);
2519  brelse(prev_bh);
2520  return ret;
2521 }
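
Editorial note: the worst-case record accounting above reduces to a small rule per existing record. The sketch below models it in user-space C; recs_needed() is a hypothetical helper mirroring the recs_add logic, not a kernel function.

#include <stdint.h>
#include <stdio.h>

struct rec { uint64_t cpos; uint32_t clusters; uint32_t refcount; };

/* Worst-case number of new records that changing the refcount of
 * [cpos, cpos + clusters) can add against one existing record. */
static int recs_needed(const struct rec *r, uint64_t start_cpos,
		       uint64_t cpos, uint32_t clusters)
{
	int recs_add;

	if (!r->refcount)
		return 1;			/* plain insert into a hole */

	recs_add = 2;				/* possible split in the middle */
	if (cpos == start_cpos && cpos != r->cpos)
		recs_add++;			/* unaligned start: head split */
	if (cpos + clusters < r->cpos + r->clusters)
		recs_add++;			/* unaligned end: tail split */
	return recs_add;
}

int main(void)
{
	struct rec r = { 10, 50, 1 };		/* record covers [10, 60) */

	/* Range [12, 32) starts and ends inside the record: 2 + 1 + 1. */
	printf("%d\n", recs_needed(&r, 12, 12, 20));	/* prints 4 */
	return 0;
}
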
2522 
2523 /*
2524  * For the refcount tree, we will decrease the refcount of some
2525  * contiguous clusters, so just go through them to see how many blocks
2526  * we are going to touch and whether we need to create new blocks.
2527  *
2528  * Normally the refcount blocks that store these refcounts should be
2529  * contiguous as well, so we can get the number easily.
2530  * We will at most add 2 split refcount records and 2 more
2531  * refcount blocks, so just check it in a rough way.
2532  *
2533  * Caller must hold refcount tree lock.
2534  */
2535 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2536  u64 refcount_loc,
2537  u64 phys_blkno,
2538  u32 clusters,
2539  int *credits,
2540  int *ref_blocks)
2541 {
2542  int ret;
2543  struct ocfs2_inode_info *oi = OCFS2_I(inode);
2544  struct buffer_head *ref_root_bh = NULL;
2545  struct ocfs2_refcount_tree *tree;
2546  u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2547 
2548  if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2549  ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
2550  "tree, but the feature bit is not set in the "
2551  "super block.", inode->i_ino);
2552  ret = -EROFS;
2553  goto out;
2554  }
2555 
2556  BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2557 
2558  ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2559  refcount_loc, &tree);
2560  if (ret) {
2561  mlog_errno(ret);
2562  goto out;
2563  }
2564 
2565  ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2566  &ref_root_bh);
2567  if (ret) {
2568  mlog_errno(ret);
2569  goto out;
2570  }
2571 
2572  ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2573  &tree->rf_ci,
2574  ref_root_bh,
2575  start_cpos, clusters,
2576  ref_blocks, credits);
2577  if (ret) {
2578  mlog_errno(ret);
2579  goto out;
2580  }
2581 
2582  trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2583 
2584 out:
2585  brelse(ref_root_bh);
2586  return ret;
2587 }
2588 
2589 #define MAX_CONTIG_BYTES 1048576
2590 
2591 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2592 {
2593  return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2594 }
2595 
2596 static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2597 {
2598  return ~(ocfs2_cow_contig_clusters(sb) - 1);
2599 }
2600 
2601 /*
2602  * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2603  * find an offset (start + (n * contig_clusters)) that is closest to cpos
2604  * while still being less than or equal to it.
2605  *
2606  * The goal is to break the extent at a multiple of contig_clusters.
2607  */
2608 static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2609  unsigned int start,
2610  unsigned int cpos)
2611 {
2612  BUG_ON(start > cpos);
2613 
2614  return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2615 }
2616 
2617 /*
2618  * Given a cluster count of len, pad it out so that it is a multiple
2619  * of contig_clusters.
2620  */
2621 static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2622  unsigned int len)
2623 {
2624  unsigned int padded =
2625  (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2626  ocfs2_cow_contig_mask(sb);
2627 
2628  /* Did we wrap? */
2629  if (padded < len)
2630  padded = UINT_MAX;
2631 
2632  return padded;
2633 }
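
Editorial note: a standalone sketch of the contig mask and alignment arithmetic above, assuming 4KB clusters so that MAX_CONTIG_BYTES (1MB) gives 256 contiguous clusters; CONTIG_CLUSTERS, align_start() and align_length() are illustrative stand-ins for the three helpers, not kernel API.

#include <stdio.h>

#define CONTIG_CLUSTERS 256u			/* 1MB / 4KB, assumed */
#define CONTIG_MASK (~(CONTIG_CLUSTERS - 1))

/* Largest start + n * CONTIG_CLUSTERS that is still <= cpos. */
static unsigned int align_start(unsigned int start, unsigned int cpos)
{
	return start + ((cpos - start) & CONTIG_MASK);
}

/* Pad len up to a multiple of CONTIG_CLUSTERS, saturating on wrap. */
static unsigned int align_length(unsigned int len)
{
	unsigned int padded = (len + CONTIG_CLUSTERS - 1) & CONTIG_MASK;
	return padded < len ? ~0u : padded;
}

int main(void)
{
	/* Extent starts at 100, I/O at cpos 700: break at 100 + 512 = 612. */
	printf("%u\n", align_start(100, 700));	/* prints 612 */
	printf("%u\n", align_length(300));	/* prints 512 */
	return 0;
}
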
2634 
2635 /*
2636  * Calculate the start and number of virtual clusters we need to CoW.
2637  *
2638  * cpos is the virtual start cluster position at which we want to do
2639  * CoW in a file and write_len is the cluster length.
2640  * max_cpos is the place where we want to stop CoW intentionally.
2641  *
2642  * Normally we will start CoW from the beginning of the extent record containing cpos.
2643  * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2644  * get good I/O from the resulting extent tree.
2645  */
2646 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2647  struct ocfs2_extent_list *el,
2648  u32 cpos,
2649  u32 write_len,
2650  u32 max_cpos,
2651  u32 *cow_start,
2652  u32 *cow_len)
2653 {
2654  int ret = 0;
2655  int tree_height = le16_to_cpu(el->l_tree_depth), i;
2656  struct buffer_head *eb_bh = NULL;
2657  struct ocfs2_extent_block *eb = NULL;
2658  struct ocfs2_extent_rec *rec;
2659  unsigned int want_clusters, rec_end = 0;
2660  int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2661  int leaf_clusters;
2662 
2663  BUG_ON(cpos + write_len > max_cpos);
2664 
2665  if (tree_height > 0) {
2666  ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2667  if (ret) {
2668  mlog_errno(ret);
2669  goto out;
2670  }
2671 
2672  eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2673  el = &eb->h_list;
2674 
2675  if (el->l_tree_depth) {
2676  ocfs2_error(inode->i_sb,
2677  "Inode %lu has non zero tree depth in "
2678  "leaf block %llu\n", inode->i_ino,
2679  (unsigned long long)eb_bh->b_blocknr);
2680  ret = -EROFS;
2681  goto out;
2682  }
2683  }
2684 
2685  *cow_len = 0;
2686  for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2687  rec = &el->l_recs[i];
2688 
2689  if (ocfs2_is_empty_extent(rec)) {
2690  mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2691  "index %d\n", inode->i_ino, i);
2692  continue;
2693  }
2694 
2695  if (le32_to_cpu(rec->e_cpos) +
2696  le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2697  continue;
2698 
2699  if (*cow_len == 0) {
2700  /*
2701  * We should find a refcounted record in the
2702  * first pass.
2703  */
2704  BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2705  *cow_start = le32_to_cpu(rec->e_cpos);
2706  }
2707 
2708  /*
2709  * If we encounter a hole, a non-refcounted record or
2710  * pass the max_cpos, stop the search.
2711  */
2712  if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2713  (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2714  (max_cpos <= le32_to_cpu(rec->e_cpos)))
2715  break;
2716 
2717  leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2718  rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2719  if (rec_end > max_cpos) {
2720  rec_end = max_cpos;
2721  leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2722  }
2723 
2724  /*
2725  * How many clusters do we actually need from
2726  * this extent? First we see how many we actually
2727  * need to complete the write. If that's smaller
2728  * than contig_clusters, we try for contig_clusters.
2729  */
2730  if (!*cow_len)
2731  want_clusters = write_len;
2732  else
2733  want_clusters = (cpos + write_len) -
2734  (*cow_start + *cow_len);
2735  if (want_clusters < contig_clusters)
2736  want_clusters = contig_clusters;
2737 
2738  /*
2739  * If the write does not cover the whole extent, we
2740  * need to calculate how we're going to split the extent.
2741  * We try to do it on contig_clusters boundaries.
2742  *
2743  * Any extent smaller than contig_clusters will be
2744  * CoWed in its entirety.
2745  */
2746  if (leaf_clusters <= contig_clusters)
2747  *cow_len += leaf_clusters;
2748  else if (*cow_len || (*cow_start == cpos)) {
2749  /*
2750  * This extent needs to be CoW'd from its
2751  * beginning, so all we have to do is compute
2752  * how many clusters to grab. We align
2753  * want_clusters to the edge of contig_clusters
2754  * to get better I/O.
2755  */
2756  want_clusters = ocfs2_cow_align_length(inode->i_sb,
2757  want_clusters);
2758 
2759  if (leaf_clusters < want_clusters)
2760  *cow_len += leaf_clusters;
2761  else
2762  *cow_len += want_clusters;
2763  } else if ((*cow_start + contig_clusters) >=
2764  (cpos + write_len)) {
2765  /*
2766  * Breaking off contig_clusters at the front
2767  * of the extent will cover our write. That's
2768  * easy.
2769  */
2770  *cow_len = contig_clusters;
2771  } else if ((rec_end - cpos) <= contig_clusters) {
2772  /*
2773  * Breaking off contig_clusters at the tail of
2774  * this extent will cover cpos.
2775  */
2776  *cow_start = rec_end - contig_clusters;
2777  *cow_len = contig_clusters;
2778  } else if ((rec_end - cpos) <= want_clusters) {
2779  /*
2780  * While we can't fit the entire write in this
2781  * extent, we know that the write goes from cpos
2782  * to the end of the extent. Break that off.
2783  * We try to break it at some multiple of
2784  * contig_clusters from the front of the extent.
2785  * Failing that (ie, cpos is within
2786  * contig_clusters of the front), we'll CoW the
2787  * entire extent.
2788  */
2789  *cow_start = ocfs2_cow_align_start(inode->i_sb,
2790  *cow_start, cpos);
2791  *cow_len = rec_end - *cow_start;
2792  } else {
2793  /*
2794  * Ok, the entire write lives in the middle of
2795  * this extent. Let's try to slice the extent up
2796  * nicely. Optimally, our CoW region starts at
2797  * m*contig_clusters from the beginning of the
2798  * extent and goes for n*contig_clusters,
2799  * covering the entire write.
2800  */
2801  *cow_start = ocfs2_cow_align_start(inode->i_sb,
2802  *cow_start, cpos);
2803 
2804  want_clusters = (cpos + write_len) - *cow_start;
2805  want_clusters = ocfs2_cow_align_length(inode->i_sb,
2806  want_clusters);
2807  if (*cow_start + want_clusters <= rec_end)
2808  *cow_len = want_clusters;
2809  else
2810  *cow_len = rec_end - *cow_start;
2811  }
2812 
2813  /* Have we covered our entire write yet? */
2814  if ((*cow_start + *cow_len) >= (cpos + write_len))
2815  break;
2816 
2817  /*
2818  * If we reach the end of the extent block and don't get enough
2819  * clusters, continue with the next extent block if possible.
2820  */
2821  if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2822  eb && eb->h_next_leaf_blk) {
2823  brelse(eb_bh);
2824  eb_bh = NULL;
2825 
2826  ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2828  &eb_bh);
2829  if (ret) {
2830  mlog_errno(ret);
2831  goto out;
2832  }
2833 
2834  eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2835  el = &eb->h_list;
2836  i = -1;
2837  }
2838  }
2839 
2840 out:
2841  brelse(eb_bh);
2842  return ret;
2843 }
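
Editorial note: as a worked example of the branches above (assuming 4KB clusters, so contig_clusters = 256): a one-cluster write at cpos 100 into a refcounted extent covering [0, 1000) first bumps want_clusters from 1 up to 256; since breaking off contig_clusters at the front of the extent already covers the write (0 + 256 >= 101), the "easy" branch fires and exactly [0, 256) is CoWed instead of the whole 1000-cluster extent.
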
2844 
2845 /*
2846  * Prepare meta_ac, data_ac and calculate credits when we want to add some
2847  * num_clusters in data_tree "et" and change the refcount for the old
2848  * clusters (starting from p_cluster) in the refcount tree.
2849  *
2850  * Note:
2851  * 1. Since we may split the old tree, we will at most need num_clusters + 2
2852  * more new leaf records.
2853  * 2. In some cases, we may not need to reserve new clusters (e.g. reflink), so
2854  * just give data_ac = NULL.
2855  */
2856 static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2857  u32 p_cluster, u32 num_clusters,
2858  struct ocfs2_extent_tree *et,
2859  struct ocfs2_caching_info *ref_ci,
2860  struct buffer_head *ref_root_bh,
2861  struct ocfs2_alloc_context **meta_ac,
2862  struct ocfs2_alloc_context **data_ac,
2863  int *credits)
2864 {
2865  int ret = 0, meta_add = 0;
2866  int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2867 
2868  if (num_free_extents < 0) {
2869  ret = num_free_extents;
2870  mlog_errno(ret);
2871  goto out;
2872  }
2873 
2874  if (num_free_extents < num_clusters + 2)
2875  meta_add =
2876  ocfs2_extend_meta_needed(et->et_root_el);
2877 
2878  *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2879  num_clusters + 2);
2880 
2881  ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2882  p_cluster, num_clusters,
2883  &meta_add, credits);
2884  if (ret) {
2885  mlog_errno(ret);
2886  goto out;
2887  }
2888 
2889  trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2890  ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2891  meta_ac);
2892  if (ret) {
2893  mlog_errno(ret);
2894  goto out;
2895  }
2896 
2897  if (data_ac) {
2898  ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2899  data_ac);
2900  if (ret)
2901  mlog_errno(ret);
2902  }
2903 
2904 out:
2905  if (ret) {
2906  if (*meta_ac) {
2907  ocfs2_free_alloc_context(*meta_ac);
2908  *meta_ac = NULL;
2909  }
2910  }
2911 
2912  return ret;
2913 }
2914 
2915 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2916 {
2917  BUG_ON(buffer_dirty(bh));
2918 
2919  clear_buffer_mapped(bh);
2920 
2921  return 0;
2922 }
2923 
2924 int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925  struct file *file,
2926  u32 cpos, u32 old_cluster,
2927  u32 new_cluster, u32 new_len)
2928 {
2929  int ret = 0, partial;
2930  struct inode *inode = file->f_path.dentry->d_inode;
2931  struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2932  struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2933  u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2934  struct page *page;
2935  pgoff_t page_index;
2936  unsigned int from, to, readahead_pages;
2937  loff_t offset, end, map_end;
2938  struct address_space *mapping = inode->i_mapping;
2939 
2940  trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2941  new_cluster, new_len);
2942 
2943  readahead_pages =
2944  (ocfs2_cow_contig_clusters(sb) <<
2945  OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2946  offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2947  end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2948  /*
2949  * We only duplicate pages until we reach the page that contains i_size - 1.
2950  * So trim 'end' to i_size.
2951  */
2952  if (end > i_size_read(inode))
2953  end = i_size_read(inode);
2954 
2955  while (offset < end) {
2956  page_index = offset >> PAGE_CACHE_SHIFT;
2957  map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2958  if (map_end > end)
2959  map_end = end;
2960 
2961  /* from, to is the offset within the page. */
2962  from = offset & (PAGE_CACHE_SIZE - 1);
2963  to = PAGE_CACHE_SIZE;
2964  if (map_end & (PAGE_CACHE_SIZE - 1))
2965  to = map_end & (PAGE_CACHE_SIZE - 1);
2966 
2967  page = find_or_create_page(mapping, page_index, GFP_NOFS);
2968  BUG_ON(!page);
2968 
2969  /*
2970  * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
2971  * can't be dirtied before we CoW it out.
2972  */
2973  if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2974  BUG_ON(PageDirty(page));
2975 
2976  if (PageReadahead(page)) {
2977  page_cache_async_readahead(mapping,
2978  &file->f_ra, file,
2979  page, page_index,
2980  readahead_pages);
2981  }
2982 
2983  if (!PageUptodate(page)) {
2984  ret = block_read_full_page(page, ocfs2_get_block);
2985  if (ret) {
2986  mlog_errno(ret);
2987  goto unlock;
2988  }
2989  lock_page(page);
2990  }
2991 
2992  if (page_has_buffers(page)) {
2993  ret = walk_page_buffers(handle, page_buffers(page),
2994  from, to, &partial,
2995  ocfs2_clear_cow_buffer);
2996  if (ret) {
2997  mlog_errno(ret);
2998  goto unlock;
2999  }
3000  }
3001 
3002  ocfs2_map_and_dirty_page(inode, handle, from, to,
3003  page, 0, &new_block);
3004  mark_page_accessed(page);
3005 unlock:
3006  unlock_page(page);
3007  page_cache_release(page);
3008  page = NULL;
3009  offset = map_end;
3010  if (ret)
3011  break;
3012  }
3013 
3014  return ret;
3015 }
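
Editorial note: the per-page from/to arithmetic in the loop above can be exercised in isolation. The sketch below assumes 4KB pages (PAGE_CACHE_SIZE == 4096) and walks a byte range the same way; it is illustrative user-space code, not the kernel routine.

#include <stdio.h>

#define PAGE_SIZE_ 4096ULL		/* assumed page size */

int main(void)
{
	unsigned long long offset = 6144, end = 13000, map_end;
	unsigned long long page_index;
	unsigned int from, to;

	while (offset < end) {
		page_index = offset / PAGE_SIZE_;
		map_end = (page_index + 1) * PAGE_SIZE_;
		if (map_end > end)
			map_end = end;

		/* from, to is the byte offset within the page. */
		from = offset & (PAGE_SIZE_ - 1);
		to = PAGE_SIZE_;
		if (map_end & (PAGE_SIZE_ - 1))
			to = map_end & (PAGE_SIZE_ - 1);

		printf("page %llu: copy bytes [%u, %u)\n",
		       page_index, from, to);
		offset = map_end;
	}
	/* Prints [2048,4096) on page 1, [0,4096) on page 2, [0,712) on page 3. */
	return 0;
}
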
3016 
3017 int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3018  struct file *file,
3019  u32 cpos, u32 old_cluster,
3020  u32 new_cluster, u32 new_len)
3021 {
3022  int ret = 0;
3023  struct inode *inode = file->f_path.dentry->d_inode;
3024  struct super_block *sb = inode->i_sb;
3025  struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026  int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027  u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028  u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3029  struct ocfs2_super *osb = OCFS2_SB(sb);
3030  struct buffer_head *old_bh = NULL;
3031  struct buffer_head *new_bh = NULL;
3032 
3033  trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3034  new_cluster, new_len);
3035 
3036  for (i = 0; i < blocks; i++, old_block++, new_block++) {
3037  new_bh = sb_getblk(osb->sb, new_block);
3038  if (new_bh == NULL) {
3039  ret = -EIO;
3040  mlog_errno(ret);
3041  break;
3042  }
3043 
3044  ocfs2_set_new_buffer_uptodate(ci, new_bh);
3045 
3046  ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3047  if (ret) {
3048  mlog_errno(ret);
3049  break;
3050  }
3051 
3052  ret = ocfs2_journal_access(handle, ci, new_bh,
3053  OCFS2_JOURNAL_ACCESS_CREATE);
3054  if (ret) {
3055  mlog_errno(ret);
3056  break;
3057  }
3058 
3059  memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3060  ocfs2_journal_dirty(handle, new_bh);
3061 
3062  brelse(new_bh);
3063  brelse(old_bh);
3064  new_bh = NULL;
3065  old_bh = NULL;
3066  }
3067 
3068  brelse(new_bh);
3069  brelse(old_bh);
3070  return ret;
3071 }
3072 
3073 static int ocfs2_clear_ext_refcount(handle_t *handle,
3074  struct ocfs2_extent_tree *et,
3075  u32 cpos, u32 p_cluster, u32 len,
3076  unsigned int ext_flags,
3077  struct ocfs2_alloc_context *meta_ac,
3078  struct ocfs2_cached_dealloc_ctxt *dealloc)
3079 {
3080  int ret, index;
3081  struct ocfs2_extent_rec replace_rec;
3082  struct ocfs2_path *path = NULL;
3083  struct ocfs2_extent_list *el;
3084  u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3085  struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3086 
3087  trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3088  cpos, len, p_cluster, ext_flags);
3089 
3090  memset(&replace_rec, 0, sizeof(replace_rec));
3091  replace_rec.e_cpos = cpu_to_le32(cpos);
3092  replace_rec.e_leaf_clusters = cpu_to_le16(len);
3093  replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3094  p_cluster));
3095  replace_rec.e_flags = ext_flags;
3096  replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3097 
3098  path = ocfs2_new_path_from_et(et);
3099  if (!path) {
3100  ret = -ENOMEM;
3101  mlog_errno(ret);
3102  goto out;
3103  }
3104 
3105  ret = ocfs2_find_path(et->et_ci, path, cpos);
3106  if (ret) {
3107  mlog_errno(ret);
3108  goto out;
3109  }
3110 
3111  el = path_leaf_el(path);
3112 
3113  index = ocfs2_search_extent_list(el, cpos);
3114  if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3115  ocfs2_error(sb,
3116  "Inode %llu has an extent at cpos %u which can no "
3117  "longer be found.\n",
3118  (unsigned long long)ino, cpos);
3119  ret = -EROFS;
3120  goto out;
3121  }
3122 
3123  ret = ocfs2_split_extent(handle, et, path, index,
3124  &replace_rec, meta_ac, dealloc);
3125  if (ret)
3126  mlog_errno(ret);
3127 
3128 out:
3129  ocfs2_free_path(path);
3130  return ret;
3131 }
3132 
3133 static int ocfs2_replace_clusters(handle_t *handle,
3134  struct ocfs2_cow_context *context,
3135  u32 cpos, u32 old,
3136  u32 new, u32 len,
3137  unsigned int ext_flags)
3138 {
3139  int ret;
3140  struct ocfs2_caching_info *ci = context->data_et.et_ci;
3141  u64 ino = ocfs2_metadata_cache_owner(ci);
3142 
3143  trace_ocfs2_replace_clusters((unsigned long long)ino,
3144  cpos, old, new, len, ext_flags);
3145 
3146  /* If the old clusters are unwritten, there is no need to duplicate. */
3147  if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148  ret = context->cow_duplicate_clusters(handle, context->file,
3149  cpos, old, new, len);
3150  if (ret) {
3151  mlog_errno(ret);
3152  goto out;
3153  }
3154  }
3155 
3156  ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3157  cpos, new, len, ext_flags,
3158  context->meta_ac, &context->dealloc);
3159  if (ret)
3160  mlog_errno(ret);
3161 out:
3162  return ret;
3163 }
3164 
3165 static int ocfs2_cow_sync_writeback(struct super_block *sb,
3166  struct inode *inode,
3167  u32 cpos, u32 num_clusters)
3168 {
3169  int ret = 0;
3170  loff_t offset, end, map_end;
3171  pgoff_t page_index;
3172  struct page *page;
3173 
3174  if (ocfs2_should_order_data(inode))
3175  return 0;
3176 
3177  offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178  end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 
3180  ret = filemap_fdatawrite_range(inode->i_mapping,
3181  offset, end - 1);
3182  if (ret < 0) {
3183  mlog_errno(ret);
3184  return ret;
3185  }
3186 
3187  while (offset < end) {
3188  page_index = offset >> PAGE_CACHE_SHIFT;
3189  map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3190  if (map_end > end)
3191  map_end = end;
3192 
3193  page = find_or_create_page(inode->i_mapping,
3194  page_index, GFP_NOFS);
3195  BUG_ON(!page);
3196 
3197  wait_on_page_writeback(page);
3198  if (PageError(page)) {
3199  ret = -EIO;
3200  mlog_errno(ret);
3201  } else
3202  mark_page_accessed(page);
3203 
3204  unlock_page(page);
3205  page_cache_release(page);
3206  page = NULL;
3207  offset = map_end;
3208  if (ret)
3209  break;
3210  }
3211 
3212  return ret;
3213 }
3214 
3215 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3216  u32 v_cluster, u32 *p_cluster,
3217  u32 *num_clusters,
3218  unsigned int *extent_flags)
3219 {
3220  return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3221  num_clusters, extent_flags);
3222 }
3223 
3224 static int ocfs2_make_clusters_writable(struct super_block *sb,
3225  struct ocfs2_cow_context *context,
3226  u32 cpos, u32 p_cluster,
3227  u32 num_clusters, unsigned int e_flags)
3228 {
3229  int ret, delete, index, credits = 0;
3230  u32 new_bit, new_len, orig_num_clusters;
3231  unsigned int set_len;
3232  struct ocfs2_super *osb = OCFS2_SB(sb);
3233  handle_t *handle;
3234  struct buffer_head *ref_leaf_bh = NULL;
3235  struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3236  struct ocfs2_refcount_rec rec;
3237 
3238  trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3239  num_clusters, e_flags);
3240 
3241  ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3242  &context->data_et,
3243  ref_ci,
3244  context->ref_root_bh,
3245  &context->meta_ac,
3246  &context->data_ac, &credits);
3247  if (ret) {
3248  mlog_errno(ret);
3249  return ret;
3250  }
3251 
3252  if (context->post_refcount)
3253  credits += context->post_refcount->credits;
3254 
3255  credits += context->extra_credits;
3256  handle = ocfs2_start_trans(osb, credits);
3257  if (IS_ERR(handle)) {
3258  ret = PTR_ERR(handle);
3259  mlog_errno(ret);
3260  goto out;
3261  }
3262 
3263  orig_num_clusters = num_clusters;
3264 
3265  while (num_clusters) {
3266  ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3267  p_cluster, num_clusters,
3268  &rec, &index, &ref_leaf_bh);
3269  if (ret) {
3270  mlog_errno(ret);
3271  goto out_commit;
3272  }
3273 
3274  BUG_ON(!rec.r_refcount);
3275  set_len = min((u64)p_cluster + num_clusters,
3276  le64_to_cpu(rec.r_cpos) +
3277  le32_to_cpu(rec.r_clusters)) - p_cluster;
3278 
3279  /*
3280  * There are several different situations here.
3281  * 1. If refcount == 1, remove the flag and don't COW.
3282  * 2. If refcount > 1, allocate clusters.
3283  * Here we may not be able to allocate r_len all at once, so
3284  * continue until we reach num_clusters.
3285  */
3286  if (le32_to_cpu(rec.r_refcount) == 1) {
3287  delete = 0;
3288  ret = ocfs2_clear_ext_refcount(handle,
3289  &context->data_et,
3290  cpos, p_cluster,
3291  set_len, e_flags,
3292  context->meta_ac,
3293  &context->dealloc);
3294  if (ret) {
3295  mlog_errno(ret);
3296  goto out_commit;
3297  }
3298  } else {
3299  delete = 1;
3300 
3301  ret = __ocfs2_claim_clusters(handle,
3302  context->data_ac,
3303  1, set_len,
3304  &new_bit, &new_len);
3305  if (ret) {
3306  mlog_errno(ret);
3307  goto out_commit;
3308  }
3309 
3310  ret = ocfs2_replace_clusters(handle, context,
3311  cpos, p_cluster, new_bit,
3312  new_len, e_flags);
3313  if (ret) {
3314  mlog_errno(ret);
3315  goto out_commit;
3316  }
3317  set_len = new_len;
3318  }
3319 
3320  ret = __ocfs2_decrease_refcount(handle, ref_ci,
3321  context->ref_root_bh,
3322  p_cluster, set_len,
3323  context->meta_ac,
3324  &context->dealloc, delete);
3325  if (ret) {
3326  mlog_errno(ret);
3327  goto out_commit;
3328  }
3329 
3330  cpos += set_len;
3331  p_cluster += set_len;
3332  num_clusters -= set_len;
3333  brelse(ref_leaf_bh);
3334  ref_leaf_bh = NULL;
3335  }
3336 
3337  /* handle any post_cow action. */
3338  if (context->post_refcount && context->post_refcount->func) {
3339  ret = context->post_refcount->func(context->inode, handle,
3340  context->post_refcount->para);
3341  if (ret) {
3342  mlog_errno(ret);
3343  goto out_commit;
3344  }
3345  }
3346 
3347  /*
3348  * Here we should write the new page out first if we are
3349  * in write-back mode.
3350  */
3351  if (context->get_clusters == ocfs2_di_get_clusters) {
3352  ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353  orig_num_clusters);
3354  if (ret)
3355  mlog_errno(ret);
3356  }
3357 
3358 out_commit:
3359  ocfs2_commit_trans(osb, handle);
3360 
3361 out:
3362  if (context->data_ac) {
3363  ocfs2_free_alloc_context(context->data_ac);
3364  context->data_ac = NULL;
3365  }
3366  if (context->meta_ac) {
3367  ocfs2_free_alloc_context(context->meta_ac);
3368  context->meta_ac = NULL;
3369  }
3370  brelse(ref_leaf_bh);
3371 
3372  return ret;
3373 }
3374 
3375 static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3376 {
3377  int ret = 0;
3378  struct inode *inode = context->inode;
3379  u32 cow_start = context->cow_start, cow_len = context->cow_len;
3380  u32 p_cluster, num_clusters;
3381  unsigned int ext_flags;
3382  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3383 
3384  if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3385  ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
3386  "tree, but the feature bit is not set in the "
3387  "super block.", inode->i_ino);
3388  return -EROFS;
3389  }
3390 
3391  ocfs2_init_dealloc_ctxt(&context->dealloc);
3392 
3393  while (cow_len) {
3394  ret = context->get_clusters(context, cow_start, &p_cluster,
3395  &num_clusters, &ext_flags);
3396  if (ret) {
3397  mlog_errno(ret);
3398  break;
3399  }
3400 
3401  BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3402 
3403  if (cow_len < num_clusters)
3404  num_clusters = cow_len;
3405 
3406  ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3407  cow_start, p_cluster,
3408  num_clusters, ext_flags);
3409  if (ret) {
3410  mlog_errno(ret);
3411  break;
3412  }
3413 
3414  cow_len -= num_clusters;
3415  cow_start += num_clusters;
3416  }
3417 
3418  if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3419  ocfs2_schedule_truncate_log_flush(osb, 1);
3420  ocfs2_run_deallocs(osb, &context->dealloc);
3421  }
3422 
3423  return ret;
3424 }
3425 
3426 static void ocfs2_readahead_for_cow(struct inode *inode,
3427  struct file *file,
3428  u32 start, u32 len)
3429 {
3430  struct address_space *mapping;
3431  pgoff_t index;
3432  unsigned long num_pages;
3433  int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3434 
3435  if (!file)
3436  return;
3437 
3438  mapping = file->f_mapping;
3439  num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3440  if (!num_pages)
3441  num_pages = 1;
3442 
3443  index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3444  page_cache_sync_readahead(mapping, &file->f_ra, file,
3445  index, num_pages);
3446 }
3447 
3448 /*
3449  * Starting at cpos, try to CoW write_len clusters. Don't CoW
3450  * past max_cpos. This will stop when it runs into a hole or an
3451  * unrefcounted extent.
3452  */
3453 static int ocfs2_refcount_cow_hunk(struct inode *inode,
3454  struct file *file,
3455  struct buffer_head *di_bh,
3456  u32 cpos, u32 write_len, u32 max_cpos)
3457 {
3458  int ret;
3459  u32 cow_start = 0, cow_len = 0;
3460  struct ocfs2_inode_info *oi = OCFS2_I(inode);
3461  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3462  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3463  struct buffer_head *ref_root_bh = NULL;
3464  struct ocfs2_refcount_tree *ref_tree;
3465  struct ocfs2_cow_context *context = NULL;
3466 
3467  BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3468 
3469  ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3470  cpos, write_len, max_cpos,
3471  &cow_start, &cow_len);
3472  if (ret) {
3473  mlog_errno(ret);
3474  goto out;
3475  }
3476 
3477  trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3478  cpos, write_len, max_cpos,
3479  cow_start, cow_len);
3480 
3481  BUG_ON(cow_len == 0);
3482 
3483  ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3484 
3485  context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3486  if (!context) {
3487  ret = -ENOMEM;
3488  mlog_errno(ret);
3489  goto out;
3490  }
3491 
3492  ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3493  1, &ref_tree, &ref_root_bh);
3494  if (ret) {
3495  mlog_errno(ret);
3496  goto out;
3497  }
3498 
3499  context->inode = inode;
3500  context->cow_start = cow_start;
3501  context->cow_len = cow_len;
3502  context->ref_tree = ref_tree;
3503  context->ref_root_bh = ref_root_bh;
3504  context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3505  context->get_clusters = ocfs2_di_get_clusters;
3506  context->file = file;
3507 
3508  ocfs2_init_dinode_extent_tree(&context->data_et,
3509  INODE_CACHE(inode), di_bh);
3510 
3511  ret = ocfs2_replace_cow(context);
3512  if (ret)
3513  mlog_errno(ret);
3514 
3515  /*
3516  * Truncate the extent map here, since no matter whether we hit an
3517  * error during the operation, we shouldn't trust the cached extent
3518  * map any more.
3519  */
3520  ocfs2_extent_map_trunc(inode, cow_start);
3521 
3522  ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3523  brelse(ref_root_bh);
3524 out:
3525  kfree(context);
3526  return ret;
3527 }
3528 
3529 /*
3530  * CoW any and all clusters between cpos and cpos+write_len.
3531  * Don't CoW past max_cpos. If this returns successfully, all
3532  * clusters between cpos and cpos+write_len are safe to modify.
3533  */
3534 int ocfs2_refcount_cow(struct inode *inode,
3535  struct file *file,
3536  struct buffer_head *di_bh,
3537  u32 cpos, u32 write_len, u32 max_cpos)
3538 {
3539  int ret = 0;
3540  u32 p_cluster, num_clusters;
3541  unsigned int ext_flags;
3542 
3543  while (write_len) {
3544  ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3545  &num_clusters, &ext_flags);
3546  if (ret) {
3547  mlog_errno(ret);
3548  break;
3549  }
3550 
3551  if (write_len < num_clusters)
3552  num_clusters = write_len;
3553 
3554  if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3555  ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3556  num_clusters, max_cpos);
3557  if (ret) {
3558  mlog_errno(ret);
3559  break;
3560  }
3561  }
3562 
3563  write_len -= num_clusters;
3564  cpos += num_clusters;
3565  }
3566 
3567  return ret;
3568 }
3569 
3570 static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3571  u32 v_cluster, u32 *p_cluster,
3572  u32 *num_clusters,
3573  unsigned int *extent_flags)
3574 {
3575  struct inode *inode = context->inode;
3576  struct ocfs2_xattr_value_root *xv = context->cow_object;
3577 
3578  return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3579  num_clusters, &xv->xr_list,
3580  extent_flags);
3581 }
3582 
3583 /*
3584  * Given a xattr value root, calculate the most meta/credits we need for
3585  * refcount tree change if we truncate it to 0.
3586  */
3587 int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3588  struct ocfs2_caching_info *ref_ci,
3589  struct buffer_head *ref_root_bh,
3590  struct ocfs2_xattr_value_root *xv,
3591  int *meta_add, int *credits)
3592 {
3593  int ret = 0, index, ref_blocks = 0;
3594  u32 p_cluster, num_clusters;
3595  u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3596  struct ocfs2_refcount_block *rb;
3597  struct ocfs2_refcount_rec rec;
3598  struct buffer_head *ref_leaf_bh = NULL;
3599 
3600  while (cpos < clusters) {
3601  ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3602  &num_clusters, &xv->xr_list,
3603  NULL);
3604  if (ret) {
3605  mlog_errno(ret);
3606  goto out;
3607  }
3608 
3609  cpos += num_clusters;
3610 
3611  while (num_clusters) {
3612  ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3613  p_cluster, num_clusters,
3614  &rec, &index,
3615  &ref_leaf_bh);
3616  if (ret) {
3617  mlog_errno(ret);
3618  goto out;
3619  }
3620 
3621  BUG_ON(!rec.r_refcount);
3622 
3623  rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3624 
3625  /*
3626  * We really don't know whether the other clusters are in
3627  * this refcount block or not, so just take the worst
3628  * case that all the clusters are in this block and each
3629  * one will split a refcount rec, so in total we need
3630  * clusters * 2 new refcount recs.
3631  */
3632  if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3633  le16_to_cpu(rb->rf_records.rl_count))
3634  ref_blocks++;
3635 
3636  *credits += 1;
3637  brelse(ref_leaf_bh);
3638  ref_leaf_bh = NULL;
3639 
3640  if (num_clusters <= le32_to_cpu(rec.r_clusters))
3641  break;
3642  else
3643  num_clusters -= le32_to_cpu(rec.r_clusters);
3644  p_cluster += num_clusters;
3645  }
3646  }
3647 
3648  *meta_add += ref_blocks;
3649  if (!ref_blocks)
3650  goto out;
3651 
3652  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3653  if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
3654  *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3655  else {
3656  struct ocfs2_extent_tree et;
3657 
3658  ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3659  *credits += ocfs2_calc_extend_credits(inode->i_sb,
3660  et.et_root_el,
3661  ref_blocks);
3662  }
3663 
3664 out:
3665  brelse(ref_leaf_bh);
3666  return ret;
3667 }
3668 
3669 /*
3670  * Do CoW for xattr.
3671  */
3672 int ocfs2_refcount_cow_xattr(struct inode *inode,
3673  struct ocfs2_dinode *di,
3674  struct ocfs2_xattr_value_buf *vb,
3675  struct ocfs2_refcount_tree *ref_tree,
3676  struct buffer_head *ref_root_bh,
3677  u32 cpos, u32 write_len,
3678  struct ocfs2_post_refcount *post)
3679 {
3680  int ret;
3681  struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3682  struct ocfs2_inode_info *oi = OCFS2_I(inode);
3683  struct ocfs2_cow_context *context = NULL;
3684  u32 cow_start, cow_len;
3685 
3686  BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3687 
3688  ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3689  cpos, write_len, UINT_MAX,
3690  &cow_start, &cow_len);
3691  if (ret) {
3692  mlog_errno(ret);
3693  goto out;
3694  }
3695 
3696  BUG_ON(cow_len == 0);
3697 
3698  context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3699  if (!context) {
3700  ret = -ENOMEM;
3701  mlog_errno(ret);
3702  goto out;
3703  }
3704 
3705  context->inode = inode;
3706  context->cow_start = cow_start;
3707  context->cow_len = cow_len;
3708  context->ref_tree = ref_tree;
3709  context->ref_root_bh = ref_root_bh;
3710  context->cow_object = xv;
3711 
3712  context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3713  /* We need the extra credits for duplicate_clusters by jbd. */
3714  context->extra_credits =
3715  ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3716  context->get_clusters = ocfs2_xattr_value_get_clusters;
3717  context->post_refcount = post;
3718 
3719  ocfs2_init_xattr_value_extent_tree(&context->data_et,
3720  INODE_CACHE(inode), vb);
3721 
3722  ret = ocfs2_replace_cow(context);
3723  if (ret)
3724  mlog_errno(ret);
3725 
3726 out:
3727  kfree(context);
3728  return ret;
3729 }
3730 
3731 /*
3732  * Insert a new extent into the refcount tree and mark an extent rec
3733  * as refcounted in the dinode tree.
3734  */
3735 int ocfs2_add_refcount_flag(struct inode *inode,
3736  struct ocfs2_extent_tree *data_et,
3737  struct ocfs2_caching_info *ref_ci,
3738  struct buffer_head *ref_root_bh,
3739  u32 cpos, u32 p_cluster, u32 num_clusters,
3740  struct ocfs2_cached_dealloc_ctxt *dealloc,
3741  struct ocfs2_post_refcount *post)
3742 {
3743  int ret;
3744  handle_t *handle;
3745  int credits = 1, ref_blocks = 0;
3746  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3747  struct ocfs2_alloc_context *meta_ac = NULL;
3748 
3749  ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3750  ref_ci, ref_root_bh,
3751  p_cluster, num_clusters,
3752  &ref_blocks, &credits);
3753  if (ret) {
3754  mlog_errno(ret);
3755  goto out;
3756  }
3757 
3758  trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3759 
3760  if (ref_blocks) {
3761  ret = ocfs2_reserve_new_metadata_blocks(osb,
3762  ref_blocks, &meta_ac);
3763  if (ret) {
3764  mlog_errno(ret);
3765  goto out;
3766  }
3767  }
3768 
3769  if (post)
3770  credits += post->credits;
3771 
3772  handle = ocfs2_start_trans(osb, credits);
3773  if (IS_ERR(handle)) {
3774  ret = PTR_ERR(handle);
3775  mlog_errno(ret);
3776  goto out;
3777  }
3778 
3779  ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3780  cpos, num_clusters, p_cluster,
3781  meta_ac, dealloc);
3782  if (ret) {
3783  mlog_errno(ret);
3784  goto out_commit;
3785  }
3786 
3787  ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3788  p_cluster, num_clusters, 0,
3789  meta_ac, dealloc);
3790  if (ret) {
3791  mlog_errno(ret);
3792  goto out_commit;
3793  }
3794 
3795  if (post && post->func) {
3796  ret = post->func(inode, handle, post->para);
3797  if (ret)
3798  mlog_errno(ret);
3799  }
3800 
3801 out_commit:
3802  ocfs2_commit_trans(osb, handle);
3803 out:
3804  if (meta_ac)
3805  ocfs2_free_alloc_context(meta_ac);
3806  return ret;
3807 }
3808 
3809 static int ocfs2_change_ctime(struct inode *inode,
3810  struct buffer_head *di_bh)
3811 {
3812  int ret;
3813  handle_t *handle;
3814  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3815 
3816  handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3817  OCFS2_INODE_UPDATE_CREDITS);
3818  if (IS_ERR(handle)) {
3819  ret = PTR_ERR(handle);
3820  mlog_errno(ret);
3821  goto out;
3822  }
3823 
3824  ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3825  OCFS2_JOURNAL_ACCESS_WRITE);
3826  if (ret) {
3827  mlog_errno(ret);
3828  goto out_commit;
3829  }
3830 
3831  inode->i_ctime = CURRENT_TIME;
3832  di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3833  di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3834 
3835  ocfs2_journal_dirty(handle, di_bh);
3836 
3837 out_commit:
3838  ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3839 out:
3840  return ret;
3841 }
3842 
3843 static int ocfs2_attach_refcount_tree(struct inode *inode,
3844  struct buffer_head *di_bh)
3845 {
3846  int ret, data_changed = 0;
3847  struct buffer_head *ref_root_bh = NULL;
3848  struct ocfs2_inode_info *oi = OCFS2_I(inode);
3849  struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3850  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3851  struct ocfs2_refcount_tree *ref_tree;
3852  unsigned int ext_flags;
3853  loff_t size;
3854  u32 cpos, num_clusters, clusters, p_cluster;
3855  struct ocfs2_cached_dealloc_ctxt dealloc;
3856  struct ocfs2_extent_tree di_et;
3857 
3858  ocfs2_init_dealloc_ctxt(&dealloc);
3859 
3860  if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3861  ret = ocfs2_create_refcount_tree(inode, di_bh);
3862  if (ret) {
3863  mlog_errno(ret);
3864  goto out;
3865  }
3866  }
3867 
3868  BUG_ON(!di->i_refcount_loc);
3869  ret = ocfs2_lock_refcount_tree(osb,
3870  le64_to_cpu(di->i_refcount_loc), 1,
3871  &ref_tree, &ref_root_bh);
3872  if (ret) {
3873  mlog_errno(ret);
3874  goto out;
3875  }
3876 
3877  if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3878  goto attach_xattr;
3879 
3880  ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3881 
3882  size = i_size_read(inode);
3883  clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3884 
3885  cpos = 0;
3886  while (cpos < clusters) {
3887  ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3888  &num_clusters, &ext_flags);
3889 
3890  if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3891  ret = ocfs2_add_refcount_flag(inode, &di_et,
3892  &ref_tree->rf_ci,
3893  ref_root_bh, cpos,
3894  p_cluster, num_clusters,
3895  &dealloc, NULL);
3896  if (ret) {
3897  mlog_errno(ret);
3898  goto unlock;
3899  }
3900 
3901  data_changed = 1;
3902  }
3903  cpos += num_clusters;
3904  }
3905 
3906 attach_xattr:
3907  if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3908  ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3909  &ref_tree->rf_ci,
3910  ref_root_bh,
3911  &dealloc);
3912  if (ret) {
3913  mlog_errno(ret);
3914  goto unlock;
3915  }
3916  }
3917 
3918  if (data_changed) {
3919  ret = ocfs2_change_ctime(inode, di_bh);
3920  if (ret)
3921  mlog_errno(ret);
3922  }
3923 
3924 unlock:
3925  ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3926  brelse(ref_root_bh);
3927 
3928  if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3929  ocfs2_schedule_truncate_log_flush(osb, 1);
3930  ocfs2_run_deallocs(osb, &dealloc);
3931  }
3932 out:
3933  /*
3934  * Empty the extent map so that we may get the right extent
3935  * record from the disk.
3936  */
3937  ocfs2_extent_map_trunc(inode, 0);
3938 
3939  return ret;
3940 }
3941 
3942 static int ocfs2_add_refcounted_extent(struct inode *inode,
3943  struct ocfs2_extent_tree *et,
3944  struct ocfs2_caching_info *ref_ci,
3945  struct buffer_head *ref_root_bh,
3946  u32 cpos, u32 p_cluster, u32 num_clusters,
3947  unsigned int ext_flags,
3948  struct ocfs2_cached_dealloc_ctxt *dealloc)
3949 {
3950  int ret;
3951  handle_t *handle;
3952  int credits = 0;
3953  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3954  struct ocfs2_alloc_context *meta_ac = NULL;
3955 
3956  ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3957  p_cluster, num_clusters,
3958  et, ref_ci,
3959  ref_root_bh, &meta_ac,
3960  NULL, &credits);
3961  if (ret) {
3962  mlog_errno(ret);
3963  goto out;
3964  }
3965 
3966  handle = ocfs2_start_trans(osb, credits);
3967  if (IS_ERR(handle)) {
3968  ret = PTR_ERR(handle);
3969  mlog_errno(ret);
3970  goto out;
3971  }
3972 
3973  ret = ocfs2_insert_extent(handle, et, cpos,
3974  ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3975  num_clusters, ext_flags, meta_ac);
3976  if (ret) {
3977  mlog_errno(ret);
3978  goto out_commit;
3979  }
3980 
3981  ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3982  p_cluster, num_clusters,
3983  meta_ac, dealloc);
3984  if (ret)
3985  mlog_errno(ret);
3986 
3987 out_commit:
3988  ocfs2_commit_trans(osb, handle);
3989 out:
3990  if (meta_ac)
3991  ocfs2_free_alloc_context(meta_ac);
3992  return ret;
3993 }
3994 
3995 static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3996  struct buffer_head *s_bh,
3997  struct inode *t_inode,
3998  struct buffer_head *t_bh)
3999 {
4000  int ret;
4001  handle_t *handle;
4002  struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4003  struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4004  struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
4005 
4006  BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
4007 
4008  handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
4009  if (IS_ERR(handle)) {
4010  ret = PTR_ERR(handle);
4011  mlog_errno(ret);
4012  goto out;
4013  }
4014 
4015  ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4016  OCFS2_JOURNAL_ACCESS_WRITE);
4017  if (ret) {
4018  mlog_errno(ret);
4019  goto out_commit;
4020  }
4021 
4022  t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
4023  memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
4024  le16_to_cpu(s_di->id2.i_data.id_count));
4025  spin_lock(&OCFS2_I(t_inode)->ip_lock);
4026  OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
4027  t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
4028  spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4029 
4030  ocfs2_journal_dirty(handle, t_bh);
4031 
4032 out_commit:
4033  ocfs2_commit_trans(osb, handle);
4034 out:
4035  return ret;
4036 }
4037 
4038 static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4039  struct inode *t_inode,
4040  struct buffer_head *t_bh,
4041  struct ocfs2_caching_info *ref_ci,
4042  struct buffer_head *ref_root_bh,
4043  struct ocfs2_cached_dealloc_ctxt *dealloc)
4044 {
4045  int ret = 0;
4046  u32 p_cluster, num_clusters, clusters, cpos;
4047  loff_t size;
4048  unsigned int ext_flags;
4049  struct ocfs2_extent_tree et;
4050 
4051  ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
4052 
4053  size = i_size_read(s_inode);
4054  clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4055 
4056  cpos = 0;
4057  while (cpos < clusters) {
4058  ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4059  &num_clusters, &ext_flags);
4060 
4061  if (p_cluster) {
4062  ret = ocfs2_add_refcounted_extent(t_inode, &et,
4063  ref_ci, ref_root_bh,
4064  cpos, p_cluster,
4065  num_clusters,
4066  ext_flags,
4067  dealloc);
4068  if (ret) {
4069  mlog_errno(ret);
4070  goto out;
4071  }
4072  }
4073 
4074  cpos += num_clusters;
4075  }
4076 
4077 out:
4078  return ret;
4079 }
4080 
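One caveat in ocfs2_duplicate_extent_list() above: the return value of ocfs2_get_clusters() is never checked, so a failed lookup is silently ignored and the loop advances on stale p_cluster/num_clusters values; later kernels add the missing test. A minimal sketch of that check, using only names already in scope:

	ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
				 &num_clusters, &ext_flags);
	if (ret) {
		/* don't continue with stale lookup results */
		mlog_errno(ret);
		goto out;
	}
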
4081 /*
4082  * Change the new file's attributes to match the source.
4083  *
4084  * reflink creates a snapshot of a file; the attributes must therefore
4085  * be identical except for three exceptions: nlink, ino, and ctime.
4086  */
4087 static int ocfs2_complete_reflink(struct inode *s_inode,
4088  struct buffer_head *s_bh,
4089  struct inode *t_inode,
4090  struct buffer_head *t_bh,
4091  bool preserve)
4092 {
4093  int ret;
4094  handle_t *handle;
4095  struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4096  struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4097  loff_t size = i_size_read(s_inode);
4098 
4099  handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4100  OCFS2_INODE_UPDATE_CREDITS);
4101  if (IS_ERR(handle)) {
4102  ret = PTR_ERR(handle);
4103  mlog_errno(ret);
4104  return ret;
4105  }
4106 
4107  ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4108  OCFS2_JOURNAL_ACCESS_WRITE);
4109  if (ret) {
4110  mlog_errno(ret);
4111  goto out_commit;
4112  }
4113 
4114  spin_lock(&OCFS2_I(t_inode)->ip_lock);
4115  OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4116  OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4117  OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4118  spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4119  i_size_write(t_inode, size);
4120  t_inode->i_blocks = s_inode->i_blocks;
4121 
4122  di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4123  di->i_clusters = s_di->i_clusters;
4124  di->i_size = s_di->i_size;
4125  di->i_dyn_features = s_di->i_dyn_features;
4126  di->i_attr = s_di->i_attr;
4127 
4128  if (preserve) {
4129  t_inode->i_uid = s_inode->i_uid;
4130  t_inode->i_gid = s_inode->i_gid;
4131  t_inode->i_mode = s_inode->i_mode;
4132  di->i_uid = s_di->i_uid;
4133  di->i_gid = s_di->i_gid;
4134  di->i_mode = s_di->i_mode;
4135 
4136  /*
4137  * Update the timestamps.
4138  * We want mtime to appear identical to the source, while
4139  * ctime records when the snapshot was taken.
4140  */
4141  t_inode->i_ctime = CURRENT_TIME;
4142 
4143  di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4144  di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4145 
4146  t_inode->i_mtime = s_inode->i_mtime;
4147  di->i_mtime = s_di->i_mtime;
4148  di->i_mtime_nsec = s_di->i_mtime_nsec;
4149  }
4150 
4151  ocfs2_journal_dirty(handle, t_bh);
4152 
4153 out_commit:
4154  ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4155  return ret;
4156 }
4157 
4158 static int ocfs2_create_reflink_node(struct inode *s_inode,
4159  struct buffer_head *s_bh,
4160  struct inode *t_inode,
4161  struct buffer_head *t_bh,
4162  bool preserve)
4163 {
4164  int ret;
4165  struct buffer_head *ref_root_bh = NULL;
4166  struct ocfs2_cached_dealloc_ctxt dealloc;
4167  struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4168  struct ocfs2_refcount_block *rb;
4169  struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4170  struct ocfs2_refcount_tree *ref_tree;
4171 
4172  ocfs2_init_dealloc_ctxt(&dealloc);
4173 
4174  ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4175  le64_to_cpu(di->i_refcount_loc));
4176  if (ret) {
4177  mlog_errno(ret);
4178  goto out;
4179  }
4180 
4181  if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4182  ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4183  t_inode, t_bh);
4184  if (ret)
4185  mlog_errno(ret);
4186  goto out;
4187  }
4188 
4189  ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4190  1, &ref_tree, &ref_root_bh);
4191  if (ret) {
4192  mlog_errno(ret);
4193  goto out;
4194  }
4195  rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; /* note: rb is set but unused in this function */
4196 
4197  ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4198  &ref_tree->rf_ci, ref_root_bh,
4199  &dealloc);
4200  if (ret) {
4201  mlog_errno(ret);
4202  goto out_unlock_refcount;
4203  }
4204 
4205 out_unlock_refcount:
4206  ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4207  brelse(ref_root_bh);
4208 out:
4209  if (ocfs2_dealloc_has_cluster(&dealloc)) {
4210  ocfs2_schedule_truncate_log_flush(osb, 1);
4211  ocfs2_run_deallocs(osb, &dealloc);
4212  }
4213 
4214  return ret;
4215 }
4216 
4217 static int __ocfs2_reflink(struct dentry *old_dentry,
4218  struct buffer_head *old_bh,
4219  struct inode *new_inode,
4220  bool preserve)
4221 {
4222  int ret;
4223  struct inode *inode = old_dentry->d_inode;
4224  struct buffer_head *new_bh = NULL;
4225 
4226  if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4227  ret = -EINVAL;
4228  mlog_errno(ret);
4229  goto out;
4230  }
4231 
4232  ret = filemap_fdatawrite(inode->i_mapping);
4233  if (ret) {
4234  mlog_errno(ret);
4235  goto out;
4236  }
4237 
4238  ret = ocfs2_attach_refcount_tree(inode, old_bh);
4239  if (ret) {
4240  mlog_errno(ret);
4241  goto out;
4242  }
4243 
4244  mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4245  ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4246  OI_LS_REFLINK_TARGET);
4247  if (ret) {
4248  mlog_errno(ret);
4249  goto out_unlock;
4250  }
4251 
4252  ret = ocfs2_create_reflink_node(inode, old_bh,
4253  new_inode, new_bh, preserve);
4254  if (ret) {
4255  mlog_errno(ret);
4256  goto inode_unlock;
4257  }
4258 
4259  if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4260  ret = ocfs2_reflink_xattrs(inode, old_bh,
4261  new_inode, new_bh,
4262  preserve);
4263  if (ret) {
4264  mlog_errno(ret);
4265  goto inode_unlock;
4266  }
4267  }
4268 
4269  ret = ocfs2_complete_reflink(inode, old_bh,
4270  new_inode, new_bh, preserve);
4271  if (ret)
4272  mlog_errno(ret);
4273 
4274 inode_unlock:
4275  ocfs2_inode_unlock(new_inode, 1);
4276  brelse(new_bh);
4277 out_unlock:
4278  mutex_unlock(&new_inode->i_mutex);
4279 out:
4280  if (!ret) {
4281  ret = filemap_fdatawait(inode->i_mapping);
4282  if (ret)
4283  mlog_errno(ret);
4284  }
4285  return ret;
4286 }
4287 
4288 static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4289  struct dentry *new_dentry, bool preserve)
4290 {
4291  int error;
4292  struct inode *inode = old_dentry->d_inode;
4293  struct buffer_head *old_bh = NULL;
4294  struct inode *new_orphan_inode = NULL;
4295 
4296  if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4297  return -EOPNOTSUPP;
4298 
4299  error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4300  &new_orphan_inode);
4301  if (error) {
4302  mlog_errno(error);
4303  goto out;
4304  }
4305 
4306  error = ocfs2_inode_lock(inode, &old_bh, 1);
4307  if (error) {
4308  mlog_errno(error);
4309  goto out;
4310  }
4311 
4312  down_write(&OCFS2_I(inode)->ip_xattr_sem);
4313  down_write(&OCFS2_I(inode)->ip_alloc_sem);
4314  error = __ocfs2_reflink(old_dentry, old_bh,
4315  new_orphan_inode, preserve);
4316  up_write(&OCFS2_I(inode)->ip_alloc_sem);
4317  up_write(&OCFS2_I(inode)->ip_xattr_sem);
4318 
4319  ocfs2_inode_unlock(inode, 1);
4320  brelse(old_bh);
4321 
4322  if (error) {
4323  mlog_errno(error);
4324  goto out;
4325  }
4326 
4327  /* If security attributes aren't preserved, we need to re-initialize them. */
4328  if (!preserve) {
4329  error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4330  &new_dentry->d_name);
4331  if (error)
4332  mlog_errno(error);
4333  }
4334 out:
4335  if (!error) {
4336  error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4337  new_dentry);
4338  if (error)
4339  mlog_errno(error);
4340  }
4341 
4342  if (new_orphan_inode) {
4343  /*
4344  * We need to open_unlock the inode no matter whether we
4345  * succeed or not, so that other nodes can delete it later.
4346  */
4347  ocfs2_open_unlock(new_orphan_inode);
4348  if (error)
4349  iput(new_orphan_inode);
4350  }
4351 
4352  return error;
4353 }
4354 
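A note on the cleanup contract in ocfs2_reflink() above: the target inode is created in the orphan directory first, so any failure simply leaves an orphaned inode for orphan recovery (on this node or another) to reclaim after the final iput(); only a fully constructed reflink is linked into the namespace by ocfs2_mv_orphaned_inode_to_new().
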
4355 /*
4356  * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4357  * sys_reflink(). This will go away when vfs_reflink() exists in
4358  * fs/namei.c.
4359  */
4360 
4361 /* copied from may_create in VFS. */
4362 static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4363 {
4364  if (child->d_inode)
4365  return -EEXIST;
4366  if (IS_DEADDIR(dir))
4367  return -ENOENT;
4368  return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4369 }
4370 
4371 /**
4372  * ocfs2_vfs_reflink - Create a reference-counted link
4373  *
4374  * @old_dentry:        source dentry + inode
4375  * @dir:       directory to create the target
4376  * @new_dentry:        target dentry
4377  * @preserve:  if true, preserve all file attributes
4378  */
4379 static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4380  struct dentry *new_dentry, bool preserve)
4381 {
4382  struct inode *inode = old_dentry->d_inode;
4383  int error;
4384 
4385  if (!inode)
4386  return -ENOENT;
4387 
4388  error = ocfs2_may_create(dir, new_dentry);
4389  if (error)
4390  return error;
4391 
4392  if (dir->i_sb != inode->i_sb)
4393  return -EXDEV;
4394 
4395  /*
4396  * A reflink to an append-only or immutable file cannot be created.
4397  */
4398  if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4399  return -EPERM;
4400 
4401  /* Only regular files can be reflinked. */
4402  if (!S_ISREG(inode->i_mode))
4403  return -EPERM;
4404 
4405  /*
4406  * If the caller wants to preserve ownership, they must have the
4407  * rights to do so.
4408  */
4409  if (preserve) {
4410  if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4411  return -EPERM;
4412  if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4413  return -EPERM;
4414  }
4415 
4416  /*
4417  * If the caller is modifying any aspect of the attributes, they
4418  * are not creating a snapshot. They need read permission on the
4419  * file.
4420  */
4421  if (!preserve) {
4422  error = inode_permission(inode, MAY_READ);
4423  if (error)
4424  return error;
4425  }
4426 
4427  mutex_lock(&inode->i_mutex);
4428  dquot_initialize(dir);
4429  error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4430  mutex_unlock(&inode->i_mutex);
4431  if (!error)
4432  fsnotify_create(dir, new_dentry);
4433  return error;
4434 }
4435 /*
4436  * Most of this code is copied from sys_linkat.
4437  */
4438 int ocfs2_reflink_ioctl(struct inode *inode,
4439  const char __user *oldname,
4440  const char __user *newname,
4441  bool preserve)
4442 {
4443  struct dentry *new_dentry;
4444  struct path old_path, new_path;
4445  int error;
4446 
4447  if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4448  return -EOPNOTSUPP;
4449 
4450  error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4451  if (error) {
4452  mlog_errno(error);
4453  return error;
4454  }
4455 
4456  new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
4457  error = PTR_ERR(new_dentry);
4458  if (IS_ERR(new_dentry)) {
4459  mlog_errno(error);
4460  goto out;
4461  }
4462 
4463  error = -EXDEV;
4464  if (old_path.mnt != new_path.mnt) {
4465  mlog_errno(error);
4466  goto out_dput;
4467  }
4468 
4469  error = ocfs2_vfs_reflink(old_path.dentry,
4470  new_path.dentry->d_inode,
4471  new_dentry, preserve);
4472 out_dput:
4473  done_path_create(&new_path, new_dentry);
4474 out:
4475  path_put(&old_path);
4476 
4477  return error;
4478 }
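
For completeness, a sketch of how userspace reaches ocfs2_reflink_ioctl(). struct reflink_arguments and the ioctl number are as defined in fs/ocfs2/ocfs2_ioctl.h; do_reflink() is a hypothetical helper, not kernel code:

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* Mirrors fs/ocfs2/ocfs2_ioctl.h: the two path names are passed as
 * user pointers cast to u64, plus the preserve flag. */
struct reflink_arguments {
	uint64_t old_path;
	uint64_t new_path;
	uint64_t preserve;
};
#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)

static int do_reflink(const char *src, const char *dst, int preserve)
{
	struct reflink_arguments args = {
		.old_path = (uintptr_t)src,
		.new_path = (uintptr_t)dst,
		.preserve = preserve,	/* 1 = snapshot; 0 = re-init security */
	};
	/* any fd on the same ocfs2 mount will do; the source is convenient */
	int fd = open(src, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, OCFS2_IOC_REFLINK, &args);
	close(fd);
	return ret;
}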