Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
xfs_dfrag.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_log.h"
22 #include "xfs_trans.h"
23 #include "xfs_sb.h"
24 #include "xfs_ag.h"
25 #include "xfs_mount.h"
26 #include "xfs_bmap_btree.h"
27 #include "xfs_dinode.h"
28 #include "xfs_inode.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_bmap.h"
31 #include "xfs_itable.h"
32 #include "xfs_dfrag.h"
33 #include "xfs_error.h"
34 #include "xfs_vnodeops.h"
35 #include "xfs_trace.h"
36 
37 
38 static int xfs_swap_extents(
39  xfs_inode_t *ip, /* target inode */
40  xfs_inode_t *tip, /* tmp inode */
41  xfs_swapext_t *sxp);
42 
43 /*
44  * ioctl interface for swapext
45  */
46 int
48  xfs_swapext_t *sxp)
49 {
50  xfs_inode_t *ip, *tip;
51  struct fd f, tmp;
52  int error = 0;
53 
54  /* Pull information for the target fd */
55  f = fdget((int)sxp->sx_fdtarget);
56  if (!f.file) {
57  error = XFS_ERROR(EINVAL);
58  goto out;
59  }
60 
61  if (!(f.file->f_mode & FMODE_WRITE) ||
62  !(f.file->f_mode & FMODE_READ) ||
63  (f.file->f_flags & O_APPEND)) {
64  error = XFS_ERROR(EBADF);
65  goto out_put_file;
66  }
67 
68  tmp = fdget((int)sxp->sx_fdtmp);
69  if (!tmp.file) {
70  error = XFS_ERROR(EINVAL);
71  goto out_put_file;
72  }
73 
74  if (!(tmp.file->f_mode & FMODE_WRITE) ||
75  !(tmp.file->f_mode & FMODE_READ) ||
76  (tmp.file->f_flags & O_APPEND)) {
77  error = XFS_ERROR(EBADF);
78  goto out_put_tmp_file;
79  }
80 
81  if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
82  IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
83  error = XFS_ERROR(EINVAL);
84  goto out_put_tmp_file;
85  }
86 
87  ip = XFS_I(f.file->f_path.dentry->d_inode);
88  tip = XFS_I(tmp.file->f_path.dentry->d_inode);
89 
90  if (ip->i_mount != tip->i_mount) {
91  error = XFS_ERROR(EINVAL);
92  goto out_put_tmp_file;
93  }
94 
95  if (ip->i_ino == tip->i_ino) {
96  error = XFS_ERROR(EINVAL);
97  goto out_put_tmp_file;
98  }
99 
100  if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
101  error = XFS_ERROR(EIO);
102  goto out_put_tmp_file;
103  }
104 
105  error = xfs_swap_extents(ip, tip, sxp);
106 
107  out_put_tmp_file:
108  fdput(tmp);
109  out_put_file:
110  fdput(f);
111  out:
112  return error;
113 }
114 
115 /*
116  * We need to check that the format of the data fork in the temporary inode is
117  * valid for the target inode before doing the swap. This is not a problem with
118  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
119  * data fork depending on the space the attribute fork is taking so we can get
120  * invalid formats on the target inode.
121  *
122  * E.g. target has space for 7 extents in extent format, temp inode only has
123  * space for 6. If we defragment down to 7 extents, then the tmp format is a
124  * btree, but when swapped it needs to be in extent format. Hence we can't just
125  * blindly swap data forks on attr2 filesystems.
126  *
127  * Note that we check the swap in both directions so that we don't end up with
128  * a corrupt temporary inode, either.
129  *
130  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
131  * inode will prevent this situation from occurring, so all we do here is
132  * reject and log the attempt. basically we are putting the responsibility on
133  * userspace to get this right.
134  */
135 static int
136 xfs_swap_extents_check_format(
137  xfs_inode_t *ip, /* target inode */
138  xfs_inode_t *tip) /* tmp inode */
139 {
140 
141  /* Should never get a local format */
142  if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
143  tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
144  return EINVAL;
145 
146  /*
147  * if the target inode has less extents that then temporary inode then
148  * why did userspace call us?
149  */
150  if (ip->i_d.di_nextents < tip->i_d.di_nextents)
151  return EINVAL;
152 
153  /*
154  * if the target inode is in extent form and the temp inode is in btree
155  * form then we will end up with the target inode in the wrong format
156  * as we already know there are less extents in the temp inode.
157  */
158  if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
159  tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
160  return EINVAL;
161 
162  /* Check temp in extent form to max in target */
163  if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
166  return EINVAL;
167 
168  /* Check target in extent form to max in temp */
169  if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
172  return EINVAL;
173 
174  /*
175  * If we are in a btree format, check that the temp root block will fit
176  * in the target and that it has enough extents to be in btree format
177  * in the target.
178  *
179  * Note that we have to be careful to allow btree->extent conversions
180  * (a common defrag case) which will occur when the temp inode is in
181  * extent format...
182  */
183  if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184  if (XFS_IFORK_BOFF(ip) &&
185  tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186  return EINVAL;
187  if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
189  return EINVAL;
190  }
191 
192  /* Reciprocal target->temp btree format checks */
193  if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194  if (XFS_IFORK_BOFF(tip) &&
195  ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
196  return EINVAL;
197 
200  return EINVAL;
201  }
202 
203  return 0;
204 }
205 
206 static int
207 xfs_swap_extents(
208  xfs_inode_t *ip, /* target inode */
209  xfs_inode_t *tip, /* tmp inode */
210  xfs_swapext_t *sxp)
211 {
212  xfs_mount_t *mp = ip->i_mount;
213  xfs_trans_t *tp;
214  xfs_bstat_t *sbp = &sxp->sx_stat;
215  xfs_ifork_t *tempifp, *ifp, *tifp;
216  int src_log_flags, target_log_flags;
217  int error = 0;
218  int aforkblks = 0;
219  int taforkblks = 0;
220  __uint64_t tmp;
221 
222  tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
223  if (!tempifp) {
224  error = XFS_ERROR(ENOMEM);
225  goto out;
226  }
227 
228  /*
229  * we have to do two separate lock calls here to keep lockdep
230  * happy. If we try to get all the locks in one call, lock will
231  * report false positives when we drop the ILOCK and regain them
232  * below.
233  */
234  xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
235  xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
236 
237  /* Verify that both files have the same format */
238  if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
239  error = XFS_ERROR(EINVAL);
240  goto out_unlock;
241  }
242 
243  /* Verify both files are either real-time or non-realtime */
245  error = XFS_ERROR(EINVAL);
246  goto out_unlock;
247  }
248 
249  if (VN_CACHED(VFS_I(tip)) != 0) {
250  error = xfs_flushinval_pages(tip, 0, -1,
252  if (error)
253  goto out_unlock;
254  }
255 
256  /* Verify O_DIRECT for ftmp */
257  if (VN_CACHED(VFS_I(tip)) != 0) {
258  error = XFS_ERROR(EINVAL);
259  goto out_unlock;
260  }
261 
262  /* Verify all data are being swapped */
263  if (sxp->sx_offset != 0 ||
264  sxp->sx_length != ip->i_d.di_size ||
265  sxp->sx_length != tip->i_d.di_size) {
266  error = XFS_ERROR(EFAULT);
267  goto out_unlock;
268  }
269 
270  trace_xfs_swap_extent_before(ip, 0);
271  trace_xfs_swap_extent_before(tip, 1);
272 
273  /* check inode formats now that data is flushed */
274  error = xfs_swap_extents_check_format(ip, tip);
275  if (error) {
276  xfs_notice(mp,
277  "%s: inode 0x%llx format is incompatible for exchanging.",
278  __func__, ip->i_ino);
279  goto out_unlock;
280  }
281 
282  /*
283  * Compare the current change & modify times with that
284  * passed in. If they differ, we abort this swap.
285  * This is the mechanism used to ensure the calling
286  * process that the file was not changed out from
287  * under it.
288  */
289  if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
290  (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
291  (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
292  (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
293  error = XFS_ERROR(EBUSY);
294  goto out_unlock;
295  }
296 
297  /* We need to fail if the file is memory mapped. Once we have tossed
298  * all existing pages, the page fault will have no option
299  * but to go to the filesystem for pages. By making the page fault call
300  * vop_read (or write in the case of autogrow) they block on the iolock
301  * until we have switched the extents.
302  */
303  if (VN_MAPPED(VFS_I(ip))) {
304  error = XFS_ERROR(EBUSY);
305  goto out_unlock;
306  }
307 
308  xfs_iunlock(ip, XFS_ILOCK_EXCL);
309  xfs_iunlock(tip, XFS_ILOCK_EXCL);
310 
311  /*
312  * There is a race condition here since we gave up the
313  * ilock. However, the data fork will not change since
314  * we have the iolock (locked for truncation too) so we
315  * are safe. We don't really care if non-io related
316  * fields change.
317  */
318 
319  xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 
322  if ((error = xfs_trans_reserve(tp, 0,
323  XFS_ICHANGE_LOG_RES(mp), 0,
324  0, 0))) {
325  xfs_iunlock(ip, XFS_IOLOCK_EXCL);
326  xfs_iunlock(tip, XFS_IOLOCK_EXCL);
327  xfs_trans_cancel(tp, 0);
328  goto out;
329  }
330  xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
331 
332  /*
333  * Count the number of extended attribute blocks
334  */
335  if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
336  (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
337  error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
338  if (error)
339  goto out_trans_cancel;
340  }
341  if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
342  (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
343  error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
344  &taforkblks);
345  if (error)
346  goto out_trans_cancel;
347  }
348 
349  /*
350  * Swap the data forks of the inodes
351  */
352  ifp = &ip->i_df;
353  tifp = &tip->i_df;
354  *tempifp = *ifp; /* struct copy */
355  *ifp = *tifp; /* struct copy */
356  *tifp = *tempifp; /* struct copy */
357 
358  /*
359  * Fix the on-disk inode values
360  */
361  tmp = (__uint64_t)ip->i_d.di_nblocks;
362  ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
363  tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
364 
365  tmp = (__uint64_t) ip->i_d.di_nextents;
366  ip->i_d.di_nextents = tip->i_d.di_nextents;
367  tip->i_d.di_nextents = tmp;
368 
369  tmp = (__uint64_t) ip->i_d.di_format;
370  ip->i_d.di_format = tip->i_d.di_format;
371  tip->i_d.di_format = tmp;
372 
373  /*
374  * The extents in the source inode could still contain speculative
375  * preallocation beyond EOF (e.g. the file is open but not modified
376  * while defrag is in progress). In that case, we need to copy over the
377  * number of delalloc blocks the data fork in the source inode is
378  * tracking beyond EOF so that when the fork is truncated away when the
379  * temporary inode is unlinked we don't underrun the i_delayed_blks
380  * counter on that inode.
381  */
382  ASSERT(tip->i_delayed_blks == 0);
383  tip->i_delayed_blks = ip->i_delayed_blks;
384  ip->i_delayed_blks = 0;
385 
386  src_log_flags = XFS_ILOG_CORE;
387  switch (ip->i_d.di_format) {
389  /* If the extents fit in the inode, fix the
390  * pointer. Otherwise it's already NULL or
391  * pointing to the extent.
392  */
393  if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
394  ifp->if_u1.if_extents =
395  ifp->if_u2.if_inline_ext;
396  }
397  src_log_flags |= XFS_ILOG_DEXT;
398  break;
400  src_log_flags |= XFS_ILOG_DBROOT;
401  break;
402  }
403 
404  target_log_flags = XFS_ILOG_CORE;
405  switch (tip->i_d.di_format) {
407  /* If the extents fit in the inode, fix the
408  * pointer. Otherwise it's already NULL or
409  * pointing to the extent.
410  */
411  if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
412  tifp->if_u1.if_extents =
413  tifp->if_u2.if_inline_ext;
414  }
415  target_log_flags |= XFS_ILOG_DEXT;
416  break;
418  target_log_flags |= XFS_ILOG_DBROOT;
419  break;
420  }
421 
422 
423  xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
424  xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
425 
426  xfs_trans_log_inode(tp, ip, src_log_flags);
427  xfs_trans_log_inode(tp, tip, target_log_flags);
428 
429  /*
430  * If this is a synchronous mount, make sure that the
431  * transaction goes to disk before returning to the user.
432  */
433  if (mp->m_flags & XFS_MOUNT_WSYNC)
434  xfs_trans_set_sync(tp);
435 
436  error = xfs_trans_commit(tp, 0);
437 
438  trace_xfs_swap_extent_after(ip, 0);
439  trace_xfs_swap_extent_after(tip, 1);
440 out:
441  kmem_free(tempifp);
442  return error;
443 
444 out_unlock:
445  xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
446  xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
447  goto out;
448 
449 out_trans_cancel:
450  xfs_trans_cancel(tp, 0);
451  goto out_unlock;
452 }