Linux Kernel 3.7.1 - fs/ocfs2/file.c

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/splice.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
#include "acl.h"
#include "quota.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp;

	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->fp_file = file;
	mutex_init(&fp->fp_mutex);
	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	file->private_data = fp;

	return 0;
}

static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (fp) {
		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
		ocfs2_lock_res_free(&fp->fp_flock);
		kfree(fp);
		file->private_data = NULL;
	}
}

static int ocfs2_file_open(struct inode *inode, struct file *file)
{
	int status;
	int mode = file->f_flags;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
			      (unsigned long long)OCFS2_I(inode)->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name, mode);

	if (file->f_mode & FMODE_WRITE)
		dquot_initialize(inode);

	spin_lock(&oi->ip_lock);

	/* Check that the inode hasn't been wiped from disk by another
	 * node. If it hasn't then we're safe as long as we hold the
	 * spin lock until our increment of open count. */
	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
		spin_unlock(&oi->ip_lock);

		status = -ENOENT;
		goto leave;
	}

	if (mode & O_DIRECT)
		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

	oi->ip_open_count++;
	spin_unlock(&oi->ip_lock);

	status = ocfs2_init_file_private(inode, file);
	if (status) {
		/*
		 * We want to set open count back if we're failing the
		 * open.
		 */
		spin_lock(&oi->ip_lock);
		oi->ip_open_count--;
		spin_unlock(&oi->ip_lock);
	}

leave:
	return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	spin_lock(&oi->ip_lock);
	if (!--oi->ip_open_count)
		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;

	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
				 oi->ip_blkno,
				 file->f_path.dentry->d_name.len,
				 file->f_path.dentry->d_name.name,
				 oi->ip_open_count);
	spin_unlock(&oi->ip_lock);

	ocfs2_free_file_private(inode, file);

	return 0;
}

static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}

static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}

static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	int err = 0;
	journal_t *journal;
	struct inode *inode = file->f_mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
			      OCFS2_I(inode)->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name,
			      (unsigned long long)datasync);

	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	/*
	 * Probably don't need the i_mutex at all in here, just putting it here
	 * to be consistent with how fsync used to be called, someone more
	 * familiar with the fs could possibly remove it.
	 */
	mutex_lock(&inode->i_mutex);
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
		/*
		 * We still have to flush drive's caches to get data to the
		 * platter
		 */
		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
		goto bail;
	}

	journal = osb->journal->j_journal;
	err = jbd2_journal_force_commit(journal);

bail:
	if (err)
		mlog_errno(err);
	mutex_unlock(&inode->i_mutex);

	return (err < 0) ? -EIO : 0;
}

int ocfs2_should_update_atime(struct inode *inode,
			      struct vfsmount *vfsmnt)
{
	struct timespec now;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return 0;

	if ((inode->i_flags & S_NOATIME) ||
	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	/*
	 * We can be called with no vfsmnt structure - NFSD will
	 * sometimes do this.
	 *
	 * Note that our action here is different than touch_atime() -
	 * if we can't tell whether this is a noatime mount, then we
	 * don't know whether to trust the value of s_atime_quantum.
	 */
	if (vfsmnt == NULL)
		return 0;

	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	if (vfsmnt->mnt_flags & MNT_RELATIME) {
		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
			return 1;

		return 0;
	}

	now = CURRENT_TIME;
	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
		return 0;
	else
		return 1;
}
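
/*
 * Usage sketch (illustrative, not part of the original file): a read
 * path that wants atime semantics honored would pair this predicate
 * with the journaled update below, assuming it already holds a valid
 * inode cluster lock and the dinode buffer head di_bh:
 *
 *	if (ocfs2_should_update_atime(inode, filp->f_path.mnt))
 *		ocfs2_update_inode_atime(inode, di_bh);
 *
 * With the default atime_quantum of 60 seconds, an access at most 60
 * seconds after the recorded i_atime returns 0 here (no update); only
 * an older atime triggers the journal transaction below.
 */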

int ocfs2_update_inode_atime(struct inode *inode,
			     struct buffer_head *bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
	 * have i_mutex to guard against concurrent changes to other
	 * inode fields.
	 */
	inode->i_atime = CURRENT_TIME;
	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	ocfs2_journal_dirty(handle, bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	return ret;
}
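
/*
 * The function above follows ocfs2's standard journaling pattern. As
 * a minimal sketch (illustrative only), any on-disk inode update in
 * this file looks like:
 *
 *	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 *	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 *				      OCFS2_JOURNAL_ACCESS_WRITE);
 *	... modify the in-core inode and the struct ocfs2_dinode in bh ...
 *	ocfs2_journal_dirty(handle, bh);
 *	ocfs2_commit_trans(osb, handle);
 *
 * ocfs2_journal_access_di() must precede the modification so jbd2 can
 * snapshot the buffer; ocfs2_journal_dirty() marks it as part of the
 * running transaction.
 */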

static int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
{
	int status;

	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	return status;
}

int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				   new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

static int ocfs2_cow_file_pos(struct inode *inode,
			      struct buffer_head *fe_bh,
			      u64 offset)
{
	int status;
	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	/*
	 * If the new offset is aligned to the range of the cluster, there is
	 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
	 * CoW either.
	 */
	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
		return 0;

	status = ocfs2_get_clusters(inode, cpos, &phys,
				    &num_clusters, &ext_flags);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
		goto out;

	return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);

out:
	return status;
}
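
/*
 * Worked example for the alignment check above (illustrative): with a
 * 64KB cluster size, s_clustersize - 1 is 0xFFFF. A truncate to offset
 * 0x30000 is cluster aligned, so nothing needs zeroing and no CoW is
 * required; a truncate to 0x31000 leaves 0xF000 bytes of the
 * containing cluster to be zeroed, so that one cluster
 * (cpos = 0x31000 >> 16 = 3) is CoWed first if it is refcounted.
 */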

static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;
	u64 cluster_bytes;

	/*
	 * We need to CoW the cluster containing the offset if it is reflinked
	 * since we will call ocfs2_zero_range_for_truncate later which will
	 * write "0" from offset to the end of the cluster.
	 */
	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	i_size_write(inode, new_i_size);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, fe_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return status;
}

static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it */
	fe = (struct ocfs2_dinode *) di_bh->b_data;

	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)le64_to_cpu(fe->i_size),
				  (unsigned long long)new_i_size);

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		trace_ocfs2_truncate_file_error(
			(unsigned long long)le64_to_cpu(fe->i_size),
			(unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	/* let's handle the simple truncate cases before doing any more
	 * cluster locking. */
	if (new_i_size == le64_to_cpu(fe->i_size))
		goto bail;

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &OCFS2_I(inode)->ip_la_data_resv);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */
	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto bail_unlock_sem;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:
	if (!status && OCFS2_I(inode)->ip_clusters == 0)
		status = ocfs2_try_remove_refcount_tree(inode, di_bh);

	return status;
}

/*
 * extend file allocation only here.
 * we'll update all the disk stuff, and oip->alloc_size
 *
 * expect stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even in error.
 */
int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct inode *inode,
			 u32 *logical_offset,
			 u32 clusters_to_add,
			 int mark_unwritten,
			 struct buffer_head *fe_bh,
			 handle_t *handle,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret)
{
	int ret;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
	ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
					  clusters_to_add, mark_unwritten,
					  data_ac, meta_ac, reason_ret);

	return ret;
}

static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	trace_ocfs2_extend_allocation(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)i_size_read(inode),
		le32_to_cpu(fe->i_clusters), clusters_to_add,
		why, restart_func);

	status = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (status)
		goto leave;
	did_quota = 1;

	/* reserve a write to the file entry early on - that way if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			restart_func = 1;
			status = 0;
		} else {
			BUG_ON(why != RESTART_TRANS);

			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     OCFS2_I(inode)->ip_clusters,
	     (unsigned long long)i_size_read(inode));

leave:
	if (status < 0 && did_quota)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	return status;
}

/*
 * While a write will already be ordering the data, a truncate will not.
 * Thus, we need to explicitly order the zeroed pages.
 */
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	if (!ocfs2_should_order_data(inode))
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_jbd2_file_inode(handle, inode);
	if (ret < 0)
		mlog_errno(ret);

out:
	if (ret) {
		if (!IS_ERR(handle))
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}

/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
				 u64 abs_to)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
	handle_t *handle = NULL;
	int ret = 0;
	unsigned zero_from, zero_to, block_start, block_end;

	BUG_ON(abs_from >= abs_to);
	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
	BUG_ON(abs_from & (inode->i_blkbits - 1));

	page = find_or_create_page(mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Get the offsets within the page that we want to zero */
	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
	if (!zero_to)
		zero_to = PAGE_CACHE_SIZE;

	trace_ocfs2_write_zero_page(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)abs_from,
			(unsigned long long)abs_to,
			index, zero_from, zero_to);

	/* We know that zero_from is block aligned */
	for (block_start = zero_from; block_start < zero_to;
	     block_start = block_end) {
		block_end = block_start + (1 << inode->i_blkbits);

		/*
		 * block_start is block-aligned.  Bump it by one to force
		 * __block_write_begin and block_commit_write to zero the
		 * whole block.
		 */
		ret = __block_write_begin(page, block_start + 1, 0,
					  ocfs2_get_block);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock;
		}

		if (!handle) {
			handle = ocfs2_zero_start_ordered_transaction(inode);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				handle = NULL;
				break;
			}
		}

		/* must not update i_size! */
		ret = block_commit_write(page, block_start + 1,
					 block_start + 1);
		if (ret < 0)
			mlog_errno(ret);
		else
			ret = 0;
	}

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);

out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
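
/*
 * Worked example for the offset math above (illustrative): with 4KB
 * pages, abs_from = 0x1200 and abs_to = 0x2000 give index = 1 and
 * zero_from = 0x200, while zero_to = 0x2000 & 0xFFF = 0, which is
 * bumped to PAGE_CACHE_SIZE so the zeroing runs to the end of the
 * page.
 */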

/*
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
	int rc = 0, needs_cow = 0;
	u32 p_cpos, zero_clusters = 0;
	u32 zero_cpos =
		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;

	while (zero_cpos < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
					&num_clusters, &ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
			zero_clusters = num_clusters;
			if (ext_flags & OCFS2_EXT_REFCOUNTED)
				needs_cow = 1;
			break;
		}

		zero_cpos += num_clusters;
	}
	if (!zero_clusters) {
		*range_end = 0;
		goto out;
	}

	while ((zero_cpos + zero_clusters) < last_cpos) {
		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
					&p_cpos, &num_clusters,
					&ext_flags);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}

		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			needs_cow = 1;
		zero_clusters += num_clusters;
	}
	if ((zero_cpos + zero_clusters) > last_cpos)
		zero_clusters = last_cpos - zero_cpos;

	if (needs_cow) {
		rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
					zero_clusters, UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			goto out;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
					     zero_cpos + zero_clusters);

out:
	return rc;
}
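
/*
 * Caller-contract sketch (illustrative): the helper above is meant to
 * be driven in a loop that feeds each returned range_end back in as
 * the next zero_start, stopping once range_end comes back 0.
 * ocfs2_zero_extend() below is that loop; schematically:
 *
 *	start = allocated tail of the file;
 *	while (start < zero_to_size) {
 *		ocfs2_zero_extend_get_range(inode, di_bh, start,
 *					    zero_to_size, &rs, &re);
 *		if (!re)
 *			break;
 *		... trim [rs, re) to the request and zero it ...
 *		start = re;
 *	}
 */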

/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end)
{
	int rc = 0;
	u64 next_pos;
	u64 zero_pos = range_start;

	trace_ocfs2_zero_extend_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)range_start,
			(unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	while (zero_pos < range_end) {
		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
		if (next_pos > range_end)
			next_pos = range_end;
		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
		if (rc < 0) {
			mlog_errno(rc);
			break;
		}
		zero_pos = next_pos;

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

	return rc;
}

int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	int ret = 0;
	u64 zero_start, range_start = 0, range_end = 0;
	struct super_block *sb = inode->i_sb;

	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				(unsigned long long)zero_start,
				(unsigned long long)i_size_read(inode));
	while (zero_start < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
						  zero_to_size,
						  &range_start,
						  &range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		if (!range_end)
			break;
		/* Trim the ends */
		if (range_start < zero_start)
			range_start = zero_start;
		if (range_end > zero_to_size)
			range_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, range_start,
					      range_end);
		if (ret) {
			mlog_errno(ret);
			break;
		}
		zero_start = range_end;
	}

	return ret;
}

int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
	int ret;
	u32 clusters_to_add;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (clusters_to_add < oi->ip_clusters)
		clusters_to_add = 0;
	else
		clusters_to_add -= oi->ip_clusters;

	if (clusters_to_add) {
		ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
						clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.  We even have to hold it for sparse files because there
	 * might be some tail zeroing.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inode's
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);
			mlog_errno(ret);
			goto out;
		}
	}

	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	else
		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
					    new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;
	struct dquot *transfer_to[MAXQUOTAS] = { };
	int qtype;

	trace_ocfs2_setattr(inode, dentry,
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    dentry->d_name.len, dentry->d_name.name,
			    attr->ia_valid, attr->ia_mode,
			    attr->ia_uid, attr->ia_gid);

	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
		return 0;

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);
	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_inode_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		status = inode_newsize_ok(inode, attr->ia_size);
		if (status)
			goto bail_unlock;

		inode_dio_wait(inode);

		if (i_size_read(inode) > attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		} else
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		/*
		 * Gather pointers to quota structures so that allocation /
		 * freeing of quota structures happens here and not inside
		 * dquot_transfer() where we have problems with lock ordering
		 */
		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
			if (!transfer_to[USRQUOTA]) {
				status = -ESRCH;
				goto bail_unlock;
			}
		}
		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
			if (!transfer_to[GRPQUOTA]) {
				status = -ESRCH;
				goto bail_unlock;
			}
		}
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
					   2 * ocfs2_quota_trans_credits(sb));
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
		status = __dquot_transfer(inode, transfer_to);
		if (status < 0)
			goto bail_commit;
	} else {
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
	}

	/*
	 * This will intentionally not wind up calling truncate_setsize(),
	 * since all the work for a size change has been done above.
	 * Otherwise, we could get into problems with truncate as
	 * ip_alloc_sem is used there to protect against i_size
	 * changes.
	 *
	 * XXX: this means the conditional below can probably be removed.
	 */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		status = vmtruncate(inode, attr->ia_size);
		if (status) {
			mlog_errno(status);
			goto bail_commit;
		}
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	brelse(bh);

	/* Release quota pointers in case we acquired them */
	for (qtype = 0; qtype < MAXQUOTAS; qtype++)
		dqput(transfer_to[qtype]);

	if (!status && attr->ia_valid & ATTR_MODE) {
		status = ocfs2_acl_chmod(inode);
		if (status < 0)
			mlog_errno(status);
	}

	return status;
}

int ocfs2_getattr(struct vfsmount *mnt,
		  struct dentry *dentry,
		  struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = dentry->d_inode->i_sb;
	struct ocfs2_super *osb = sb->s_fs_info;
	int err;

	err = ocfs2_inode_revalidate(dentry);
	if (err) {
		if (err != -ENOENT)
			mlog_errno(err);
		goto bail;
	}

	generic_fillattr(inode, stat);

	/* We set the blksize from the cluster size for performance */
	stat->blksize = osb->s_clustersize;

bail:
	return err;
}

int ocfs2_permission(struct inode *inode, int mask)
{
	int ret;

	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;

	ret = ocfs2_inode_lock(inode, NULL, 0);
	if (ret) {
		if (ret != -ENOENT)
			mlog_errno(ret);
		goto out;
	}

	ret = generic_permission(inode, mask);

	ocfs2_inode_unlock(inode, 0);
out:
	return ret;
}

static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	trace_ocfs2_write_remove_suid(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			inode->i_mode);

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	inode->i_mode &= ~S_ISUID;
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);

	ocfs2_journal_dirty(handle, bh);

out_trans:
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

/*
 * Will look for holes and unwritten extents in the range starting at
 * pos for count bytes (inclusive).
 */
static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
				       size_t count)
{
	int ret = 0;
	unsigned int extent_flags;
	u32 cpos, clusters, extent_len, phys_cpos;
	struct super_block *sb = inode->i_sb;

	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
					 &extent_flags);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
			ret = 1;
			break;
		}

		if (extent_len > clusters)
			extent_len = clusters;

		clusters -= extent_len;
		cpos += extent_len;
	}
out:
	return ret;
}
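
/*
 * Usage sketch (illustrative): the direct I/O write path can use this
 * predicate to fall back to buffered I/O, since ocfs2 does not fill
 * holes during O_DIRECT writes:
 *
 *	ret = ocfs2_check_range_for_holes(inode, pos, count);
 *	if (ret == 1) {
 *		direct_io = 0;	// retake locks, do the I/O buffered
 *		ret = 0;
 *	}
 */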

static int ocfs2_write_remove_suid(struct inode *inode)
{
	int ret;
	struct buffer_head *bh = NULL;

	ret = ocfs2_read_inode_block(inode, &bh);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_write_remove_suid(inode, bh);
out:
	brelse(bh);
	return ret;
}

/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes.  Existing extents are skipped, any extents
 * added are marked as "unwritten".
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	clusters -= cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}

/*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
					 u64 byte_len)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t start, end;
	struct address_space *mapping = inode->i_mapping;

	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
	end = byte_start + byte_len;
	end = end & ~(osb->s_clustersize - 1);

	if (start < end) {
		unmap_mapping_range(mapping, start, end - start, 0);
		truncate_inode_pages_range(mapping, start, end - 1);
	}
}
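
/*
 * Worked example (illustrative): with 64KB clusters, a punch of
 * byte_start = 100KB, byte_len = 100KB (i.e. [100KB, 200KB)) rounds
 * start up to 128KB and end down to 192KB, so only the fully-covered
 * cluster range [128KB, 192KB) has its pages dropped; the partial
 * clusters on either side keep their pages for the zeroing code below.
 */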

static int ocfs2_zero_partial_clusters(struct inode *inode,
				       u64 start, u64 len)
{
	int ret = 0;
	u64 tmpend, end = start + len;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int csize = osb->s_clustersize;
	handle_t *handle;

	/*
	 * The "start" and "end" values are NOT necessarily part of
	 * the range whose allocation is being deleted. Rather, this
	 * is what the user passed in with the request. We must zero
	 * partial clusters here. There's no need to worry about
	 * physical allocation - the zeroing code knows to skip holes.
	 */
	trace_ocfs2_zero_partial_clusters(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)start, (unsigned long long)end);

	/*
	 * If both edges are on a cluster boundary then there's no
	 * zeroing required as the region is part of the allocation to
	 * be truncated.
	 */
	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We want to get the byte offset of the end of the 1st cluster.
	 */
	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
	if (tmpend > end)
		tmpend = end;

	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
						 (unsigned long long)tmpend);

	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
	if (ret)
		mlog_errno(ret);

	if (tmpend < end) {
		/*
		 * This may make start and end equal, but the zeroing
		 * code will skip any work in that case so there's no
		 * need to catch it up here.
		 */
		start = end & ~(osb->s_clustersize - 1);

		trace_ocfs2_zero_partial_clusters_range2(
			(unsigned long long)start, (unsigned long long)end);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
		if (ret)
			mlog_errno(ret);
	}

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}
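
/*
 * Worked example (illustrative): continuing the 64KB-cluster hole
 * punch over [100KB, 200KB): the first pass zeroes the head partial
 * cluster [100KB, 128KB) (tmpend is the end of the cluster holding
 * start), and the second pass zeroes the tail partial cluster
 * [192KB, 200KB). The whole clusters in between are freed outright by
 * ocfs2_remove_inode_range() below, so they need no zeroing here.
 */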

static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
{
	int i;
	struct ocfs2_extent_rec *rec = NULL;

	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {

		rec = &el->l_recs[i];

		if (le32_to_cpu(rec->e_cpos) < pos)
			break;
	}

	return i;
}

/*
 * Helper to calculate the punching pos and length in one run, we handle the
 * following three cases in order:
 *
 * - remove the entire record
 * - remove a partial record
 * - no record needs to be removed (hole-punching completed)
 */
static void ocfs2_calc_trunc_pos(struct inode *inode,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_extent_rec *rec,
				 u32 trunc_start, u32 *trunc_cpos,
				 u32 *trunc_len, u32 *trunc_end,
				 u64 *blkno, int *done)
{
	int ret = 0;
	u32 coff, range;

	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
		/*
		 * remove an entire extent record.
		 */
		*trunc_cpos = le32_to_cpu(rec->e_cpos);
		/*
		 * Skip holes if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno);
		*trunc_end = le32_to_cpu(rec->e_cpos);
	} else if (range > trunc_start) {
		/*
		 * remove a partial extent record, which means we're
		 * removing the last extent record.
		 */
		*trunc_cpos = trunc_start;
		/*
		 * skip hole if any.
		 */
		if (range < *trunc_end)
			*trunc_end = range;
		*trunc_len = *trunc_end - trunc_start;
		coff = trunc_start - le32_to_cpu(rec->e_cpos);
		*blkno = le64_to_cpu(rec->e_blkno) +
				ocfs2_clusters_to_blocks(inode->i_sb, coff);
		*trunc_end = trunc_start;
	} else {
		/*
		 * There are two remaining possibilities:
		 *
		 * - the last record has been removed
		 * - trunc_start was within a hole
		 *
		 * Either case means hole punching is complete.
		 */
		ret = 1;
	}

	*done = ret;
}
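
/*
 * Worked example (illustrative): punching back-to-front with
 * trunc_start = 10, a record covering cpos 12..19 is removed whole
 * (case one, trunc_cpos = 12); a record covering cpos 8..15 loses its
 * partial tail 10..15 (case two, trunc_cpos = 10, with blkno advanced
 * by coff = 2 clusters); and a record ending at or before cpos 10
 * sets *done (case three).
 */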

static int ocfs2_remove_inode_range(struct inode *inode,
				    struct buffer_head *di_bh, u64 byte_start,
				    u64 byte_len)
{
	int ret = 0, flags = 0, done = 0, i;
	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
	u32 cluster_in_el;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_extent_tree et;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el = NULL;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	trace_ocfs2_remove_inode_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)byte_start,
			(unsigned long long)byte_len);

	if (byte_len == 0)
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * There's no need to get fancy with the page cache
		 * truncate of an inline-data inode. We're talking
		 * about less than a page here, which will be cached
		 * in the dinode buffer anyway.
		 */
		unmap_mapping_range(mapping, 0, 0, 0);
		truncate_inode_pages(mapping, 0);
		goto out;
	}

	/*
	 * For reflinks, we may need to CoW 2 clusters which might be
	 * partially zero'd later, if the hole's start and end offsets
	 * fall within one cluster (i.e. are not exactly aligned to the
	 * cluster size).
	 */

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {

		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
	cluster_in_el = trunc_end;

	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	path = ocfs2_new_path_from_et(&et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	while (trunc_end > trunc_start) {

		ret = ocfs2_find_path(INODE_CACHE(inode), path,
				      cluster_in_el);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);

		i = ocfs2_find_rec(el, trunc_end);
		/*
		 * Need to go to previous extent block.
		 */
		if (i < 0) {
			if (path->p_tree_depth == 0)
				break;

			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
							    path,
							    &cluster_in_el);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * We've reached the leftmost extent block,
			 * it's safe to leave.
			 */
			if (cluster_in_el == 0)
				break;

			/*
			 * The 'pos' searched for previous extent block is
			 * always one cluster less than actual trunc_end.
			 */
			trunc_end = cluster_in_el + 1;

			ocfs2_reinit_path(path, 1);

			continue;

		} else
			rec = &el->l_recs[i];

		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
				     &trunc_len, &trunc_end, &blkno, &done);
		if (done)
			break;

		flags = rec->e_flags;
		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					       phys_cpos, trunc_len, flags,
					       &dealloc, refcount_loc);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		cluster_in_el = trunc_end;

		ocfs2_reinit_path(path, 1);
	}

	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}

/*
 * Parts of this function taken from xfs_change_file_space()
 */
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
				     loff_t f_pos, unsigned int cmd,
				     struct ocfs2_space_resv *sr,
				     int change_size)
{
	int ret;
	s64 llen;
	loff_t size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *di_bh = NULL;
	handle_t *handle;
	unsigned long long max_off = inode->i_sb->s_maxbytes;

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	mutex_lock(&inode->i_mutex);

	/*
	 * This prevents concurrent writes on other nodes
	 */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_rw_unlock;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		ret = -EPERM;
		goto out_inode_unlock;
	}

	switch (sr->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
		sr->l_start += f_pos;
		break;
	case 2: /*SEEK_END*/
		sr->l_start += i_size_read(inode);
		break;
	default:
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	sr->l_whence = 0;

	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

	if (sr->l_start < 0
	    || sr->l_start > max_off
	    || (sr->l_start + llen) < 0
	    || (sr->l_start + llen) > max_off) {
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	size = sr->l_start + sr->l_len;

	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
		if (sr->l_len <= 0) {
			ret = -EINVAL;
			goto out_inode_unlock;
		}
	}

	if (file && should_remove_suid(file->f_path.dentry)) {
		ret = __ocfs2_write_remove_suid(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_inode_unlock;
		}
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	switch (cmd) {
	case OCFS2_IOC_RESVSP:
	case OCFS2_IOC_RESVSP64:
		/*
		 * This takes unsigned offsets, but the signed ones we
		 * pass have been checked against overflow above.
		 */
		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
						       sr->l_len);
		break;
	case OCFS2_IOC_UNRESVSP:
	case OCFS2_IOC_UNRESVSP64:
		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
					       sr->l_len);
		break;
	default:
		ret = -EINVAL;
	}
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret) {
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	/*
	 * We update c/mtime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	if (change_size && i_size_read(inode) < size)
		i_size_write(inode, size);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	if (ret < 0)
		mlog_errno(ret);

	if (file && (file->f_flags & O_SYNC))
		handle->h_sync = 1;

	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);

out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}

int ocfs2_change_file_space(struct file *file, unsigned int cmd,
			    struct ocfs2_space_resv *sr)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int ret;

	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
	    !ocfs2_writes_unwritten_extents(osb))
		return -ENOTTY;
	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
		 !ocfs2_sparse_alloc(osb))
		return -ENOTTY;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;
	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
	mnt_drop_write_file(file);
	return ret;
}
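
/*
 * Userspace sketch (illustrative): the entry point above backs the
 * xfs-style space reservation ioctls. A caller would fill a struct
 * ocfs2_space_resv and issue, e.g.:
 *
 *	struct ocfs2_space_resv sr = {
 *		.l_whence = 0,		// SEEK_SET-style interpretation
 *		.l_start  = offset,
 *		.l_len    = length,
 *	};
 *	ioctl(fd, OCFS2_IOC_RESVSP64, &sr);	// preallocate unwritten
 *	ioctl(fd, OCFS2_IOC_UNRESVSP64, &sr);	// punch the range back out
 */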

static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
			    loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_space_resv sr;
	int change_size = 1;
	int cmd = OCFS2_IOC_RESVSP64;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;
	if (!ocfs2_writes_unwritten_extents(osb))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_KEEP_SIZE)
		change_size = 0;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		cmd = OCFS2_IOC_UNRESVSP64;

	sr.l_whence = 0;
	sr.l_start = (s64)offset;
	sr.l_len = (s64)len;

	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
					 change_size);
}
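
/*
 * Mapping sketch (illustrative): fallocate(2) requests are translated
 * onto the same engine as the reservation ioctls:
 *
 *	fallocate(fd, 0, off, len)
 *		-> OCFS2_IOC_RESVSP64, i_size grows to off + len if larger
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, off, len)
 *		-> OCFS2_IOC_RESVSP64, i_size untouched
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len)
 *		-> OCFS2_IOC_UNRESVSP64
 *
 * The VFS requires FALLOC_FL_PUNCH_HOLE to be paired with
 * FALLOC_FL_KEEP_SIZE, so change_size is always 0 on that path.
 */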

int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
				   size_t count)
{
	int ret = 0;
	unsigned int extent_flags;
	u32 cpos, clusters, extent_len, phys_cpos;
	struct super_block *sb = inode->i_sb;

	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
	    OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
					 &extent_flags);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
			ret = 1;
			break;
		}

		if (extent_len > clusters)
			extent_len = clusters;

		clusters -= extent_len;
		cpos += extent_len;
	}
out:
	return ret;
}
2061 
2062 static void ocfs2_aiodio_wait(struct inode *inode)
2063 {
2064  wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2065 
2066  wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2067 }
2068 
2069 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2070 {
2071  int blockmask = inode->i_sb->s_blocksize - 1;
2072  loff_t final_size = pos + count;
2073 
2074  if ((pos & blockmask) || (final_size & blockmask))
2075  return 1;
2076  return 0;
2077 }
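/*
 * Editor's note: a worked example of the mask test above. With a 4 KiB
 * block size, blockmask is 0xfff, so:
 *
 *	pos = 4096, count = 8192 -> aligned   (4096 & 0xfff == 0,
 *						12288 & 0xfff == 0)
 *	pos = 4096, count = 100  -> unaligned (final_size 4196 & 0xfff != 0)
 *	pos = 10,   count = 4086 -> unaligned (pos & 0xfff != 0)
 */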
2078 
2079 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2080  struct file *file,
2081  loff_t pos, size_t count,
2082  int *meta_level)
2083 {
2084  int ret;
2085  struct buffer_head *di_bh = NULL;
2086  u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2087  u32 clusters =
2088  ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2089 
2090  ret = ocfs2_inode_lock(inode, &di_bh, 1);
2091  if (ret) {
2092  mlog_errno(ret);
2093  goto out;
2094  }
2095 
2096  *meta_level = 1;
2097 
2098  ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2099  if (ret)
2100  mlog_errno(ret);
2101 out:
2102  brelse(di_bh);
2103  return ret;
2104 }
2105 
2106 static int ocfs2_prepare_inode_for_write(struct file *file,
2107  loff_t *ppos,
2108  size_t count,
2109  int appending,
2110  int *direct_io,
2111  int *has_refcount)
2112 {
2113  int ret = 0, meta_level = 0;
2114  struct dentry *dentry = file->f_path.dentry;
2115  struct inode *inode = dentry->d_inode;
2116  loff_t saved_pos = 0, end;
2117 
2118  /*
2119  * We start with a read level meta lock and only jump to an
2120  * exclusive (ex) lock if we need to make modifications here.
2121  */
2122  for(;;) {
2123  ret = ocfs2_inode_lock(inode, NULL, meta_level);
2124  if (ret < 0) {
2125  meta_level = -1;
2126  mlog_errno(ret);
2127  goto out;
2128  }
2129 
2130  /* Clear suid / sgid if necessary. We do this here
2131  * instead of later in the write path because
2132  * remove_suid() calls ->setattr without any hint that
2133  * we may have already done our cluster locking. Since
2134  * ocfs2_setattr() *must* take cluster locks to
2135  * proceed, this will lead us to recursively lock the
2136  * inode. There's also the dinode i_size state which
2137  * can be lost via setattr during extending writes (we
2138  * set inode->i_size at the end of a write.) */
2139  if (should_remove_suid(dentry)) {
2140  if (meta_level == 0) {
2141  ocfs2_inode_unlock(inode, meta_level);
2142  meta_level = 1;
2143  continue;
2144  }
2145 
2146  ret = ocfs2_write_remove_suid(inode);
2147  if (ret < 0) {
2148  mlog_errno(ret);
2149  goto out_unlock;
2150  }
2151  }
2152 
2153  /* work on a copy of ppos until we're sure that we won't have
2154  * to recalculate it due to relocking. */
2155  if (appending)
2156  saved_pos = i_size_read(inode);
2157  else
2158  saved_pos = *ppos;
2159 
2160  end = saved_pos + count;
2161 
2162  ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2163  if (ret == 1) {
2164  ocfs2_inode_unlock(inode, meta_level);
2165  meta_level = -1;
2166 
2167  ret = ocfs2_prepare_inode_for_refcount(inode,
2168  file,
2169  saved_pos,
2170  count,
2171  &meta_level);
2172  if (has_refcount)
2173  *has_refcount = 1;
2174  if (direct_io)
2175  *direct_io = 0;
2176  }
2177 
2178  if (ret < 0) {
2179  mlog_errno(ret);
2180  goto out_unlock;
2181  }
2182 
2183  /*
2184  * Skip the O_DIRECT checks if we don't need
2185  * them.
2186  */
2187  if (!direct_io || !(*direct_io))
2188  break;
2189 
2190  /*
2191  * There's no sane way to do direct writes to an inode
2192  * with inline data.
2193  */
2194  if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2195  *direct_io = 0;
2196  break;
2197  }
2198 
2199  /*
2200  * Allowing concurrent direct writes means
2201  * i_size changes wouldn't be synchronized, so
2202  * one node could wind up truncating another
2203  * node's writes.
2204  */
2205  if (end > i_size_read(inode)) {
2206  *direct_io = 0;
2207  break;
2208  }
2209 
2210  /*
2211  * We don't fill holes during direct io, so
2212  * check for them here. If any are found, the
2213  * caller will have to retake some cluster
2214  * locks and initiate the io as buffered.
2215  */
2216  ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2217  if (ret == 1) {
2218  *direct_io = 0;
2219  ret = 0;
2220  } else if (ret < 0)
2221  mlog_errno(ret);
2222  break;
2223  }
2224 
2225  if (appending)
2226  *ppos = saved_pos;
2227 
2228 out_unlock:
2229  trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2230  saved_pos, appending, count,
2231  direct_io, has_refcount);
2232 
2233  if (meta_level >= 0)
2234  ocfs2_inode_unlock(inode, meta_level);
2235 
2236 out:
2237  return ret;
2238 }
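/*
 * Editor's note: the loop above reduced to its locking skeleton. The key
 * constraint is that the shared (PR) cluster lock cannot be upgraded in
 * place: it has to be dropped and re-taken at EX, and every check must be
 * redone after relocking. need_exclusive_work() is a hypothetical
 * stand-in for the suid/refcount checks.
 */
static bool need_exclusive_work(struct inode *inode);	/* hypothetical */

static int prepare_with_escalation(struct inode *inode)
{
	int level = 0;	/* 0 == shared, 1 == exclusive */
	int ret;

	for (;;) {
		ret = ocfs2_inode_lock(inode, NULL, level);
		if (ret < 0)
			return ret;

		if (need_exclusive_work(inode) && level == 0) {
			/* drop PR, loop around and re-take at EX */
			ocfs2_inode_unlock(inode, level);
			level = 1;
			continue;
		}
		break;
	}

	ocfs2_inode_unlock(inode, level);
	return 0;
}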
2239 
2240 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2241  const struct iovec *iov,
2242  unsigned long nr_segs,
2243  loff_t pos)
2244 {
2245  int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2246  int can_do_direct, has_refcount = 0;
2247  ssize_t written = 0;
2248  size_t ocount; /* original count */
2249  size_t count; /* after file limit checks */
2250  loff_t old_size, *ppos = &iocb->ki_pos;
2251  u32 old_clusters;
2252  struct file *file = iocb->ki_filp;
2253  struct inode *inode = file->f_path.dentry->d_inode;
2254  struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255  int full_coherency = !(osb->s_mount_opt &
2256  OCFS2_MOUNT_COHERENCY_BUFFERED);
2257  int unaligned_dio = 0;
2258 
2259  trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2260  (unsigned long long)OCFS2_I(inode)->ip_blkno,
2261  file->f_path.dentry->d_name.len,
2262  file->f_path.dentry->d_name.name,
2263  (unsigned int)nr_segs);
2264 
2265  if (iocb->ki_left == 0)
2266  return 0;
2267 
2268  sb_start_write(inode->i_sb);
2269 
2270  appending = file->f_flags & O_APPEND ? 1 : 0;
2271  direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2272 
2273  mutex_lock(&inode->i_mutex);
2274 
2275  ocfs2_iocb_clear_sem_locked(iocb);
2276 
2277 relock:
2278  /* to match setattr's i_mutex -> rw_lock ordering */
2279  if (direct_io) {
2280  have_alloc_sem = 1;
2281  /* communicate with ocfs2_dio_end_io */
2282  ocfs2_iocb_set_sem_locked(iocb);
2283  }
2284 
2285  /*
2286  * Concurrent O_DIRECT writes are allowed with
2287  * the "coherency=buffered" mount option.
2288  */
2289  rw_level = (!direct_io || full_coherency);
2290 
2291  ret = ocfs2_rw_lock(inode, rw_level);
2292  if (ret < 0) {
2293  mlog_errno(ret);
2294  goto out_sems;
2295  }
2296 
2297  /*
2298  * O_DIRECT writes with "coherency=full" need to take EX cluster
2299  * inode_lock to guarantee coherency.
2300  */
2301  if (direct_io && full_coherency) {
2302  /*
2303  * We need to take and drop the inode lock to force
2304  * other nodes to drop their caches. Buffered I/O
2305  * already does this in write_begin().
2306  */
2307  ret = ocfs2_inode_lock(inode, NULL, 1);
2308  if (ret < 0) {
2309  mlog_errno(ret);
2310  goto out_sems;
2311  }
2312 
2313  ocfs2_inode_unlock(inode, 1);
2314  }
2315 
2316  can_do_direct = direct_io;
2317  ret = ocfs2_prepare_inode_for_write(file, ppos,
2318  iocb->ki_left, appending,
2319  &can_do_direct, &has_refcount);
2320  if (ret < 0) {
2321  mlog_errno(ret);
2322  goto out;
2323  }
2324 
2325  if (direct_io && !is_sync_kiocb(iocb))
2326  unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2327  *ppos);
2328 
2329  /*
2330  * We can't complete the direct I/O as requested, fall back to
2331  * buffered I/O.
2332  */
2333  if (direct_io && !can_do_direct) {
2334  ocfs2_rw_unlock(inode, rw_level);
2335 
2336  have_alloc_sem = 0;
2337  rw_level = -1;
2338 
2339  direct_io = 0;
2340  goto relock;
2341  }
2342 
2343  if (unaligned_dio) {
2344  /*
2345  * Wait on previous unaligned aio to complete before
2346  * proceeding.
2347  */
2348  ocfs2_aiodio_wait(inode);
2349 
2350  /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2351  atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2352  ocfs2_iocb_set_unaligned_aio(iocb);
2353  }
2354 
2355  /*
2356  * To later detect whether a journal commit for sync writes is
2357  * necessary, we sample i_size and cluster count here.
2358  */
2359  old_size = i_size_read(inode);
2360  old_clusters = OCFS2_I(inode)->ip_clusters;
2361 
2362  /* communicate with ocfs2_dio_end_io */
2363  ocfs2_iocb_set_rw_locked(iocb, rw_level);
2364 
2365  ret = generic_segment_checks(iov, &nr_segs, &ocount,
2366  VERIFY_READ);
2367  if (ret)
2368  goto out_dio;
2369 
2370  count = ocount;
2371  ret = generic_write_checks(file, ppos, &count,
2372  S_ISBLK(inode->i_mode));
2373  if (ret)
2374  goto out_dio;
2375 
2376  if (direct_io) {
2377  written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2378  ppos, count, ocount);
2379  if (written < 0) {
2380  ret = written;
2381  goto out_dio;
2382  }
2383  } else {
2384  current->backing_dev_info = file->f_mapping->backing_dev_info;
2385  written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2386  ppos, count, 0);
2387  current->backing_dev_info = NULL;
2388  }
2389 
2390 out_dio:
2391  /* buffered aio wouldn't have proper lock coverage today */
2392  BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2393 
2394  if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2395  ((file->f_flags & O_DIRECT) && !direct_io)) {
2396  ret = filemap_fdatawrite_range(file->f_mapping, pos,
2397  pos + count - 1);
2398  if (ret < 0)
2399  written = ret;
2400 
2401  if (!ret && ((old_size != i_size_read(inode)) ||
2402  (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2403  has_refcount)) {
2404  ret = jbd2_journal_force_commit(osb->journal->j_journal);
2405  if (ret < 0)
2406  written = ret;
2407  }
2408 
2409  if (!ret)
2410  ret = filemap_fdatawait_range(file->f_mapping, pos,
2411  pos + count - 1);
2412  }
2413 
2414  /*
2415  * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2416  * function pointer which is called when o_direct io completes so that
2417  * it can unlock our rw lock.
2418  * Unfortunately there are error cases which call end_io and others
2419  * that don't, so we don't have to unlock the rw_lock if either an
2420  * async dio is going to do it in the future or an end_io after an
2421  * error has already done it.
2422  */
2423  if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2424  rw_level = -1;
2425  have_alloc_sem = 0;
2426  unaligned_dio = 0;
2427  }
2428 
2429  if (unaligned_dio) {
2430  ocfs2_iocb_clear_unaligned_aio(iocb);
2431  atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2432  }
2433 
2434 out:
2435  if (rw_level != -1)
2436  ocfs2_rw_unlock(inode, rw_level);
2437 
2438 out_sems:
2439  if (have_alloc_sem)
2440  ocfs2_iocb_clear_sem_locked(iocb);
2441 
2442  mutex_unlock(&inode->i_mutex);
2443  sb_end_write(inode->i_sb);
2444 
2445  if (written)
2446  ret = written;
2447  return ret;
2448 }
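/*
 * Editor's note: the O_DSYNC/IS_SYNC tail of the function above, condensed
 * into a hypothetical helper. The point of sampling old_size and
 * old_clusters before the write is visible here: a jbd2 commit is forced
 * only when the write changed metadata (size, cluster count, or refcount
 * CoW), while a pure in-place overwrite gets by with flushing data pages.
 */
static int sync_write_tail(struct file *file, struct inode *inode,
			   loff_t old_size, u32 old_clusters,
			   int has_refcount, loff_t pos, size_t count)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int ret;

	ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1);
	if (ret < 0)
		return ret;

	if (old_size != i_size_read(inode) ||
	    old_clusters != OCFS2_I(inode)->ip_clusters ||
	    has_refcount) {
		ret = jbd2_journal_force_commit(osb->journal->j_journal);
		if (ret < 0)
			return ret;
	}

	return filemap_fdatawait_range(file->f_mapping, pos, pos + count - 1);
}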
2449 
2450 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2451  struct file *out,
2452  struct splice_desc *sd)
2453 {
2454  int ret;
2455 
2456  ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2457  sd->total_len, 0, NULL, NULL);
2458  if (ret < 0) {
2459  mlog_errno(ret);
2460  return ret;
2461  }
2462 
2463  return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2464 }
2465 
2466 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2467  struct file *out,
2468  loff_t *ppos,
2469  size_t len,
2470  unsigned int flags)
2471 {
2472  int ret;
2473  struct address_space *mapping = out->f_mapping;
2474  struct inode *inode = mapping->host;
2475  struct splice_desc sd = {
2476  .total_len = len,
2477  .flags = flags,
2478  .pos = *ppos,
2479  .u.file = out,
2480  };
2481 
2482 
2483  trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2484  (unsigned long long)OCFS2_I(inode)->ip_blkno,
2485  out->f_path.dentry->d_name.len,
2486  out->f_path.dentry->d_name.name, len);
2487 
2488  if (pipe->inode)
2489  mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2490 
2491  splice_from_pipe_begin(&sd);
2492  do {
2493  ret = splice_from_pipe_next(pipe, &sd);
2494  if (ret <= 0)
2495  break;
2496 
2497  mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2498  ret = ocfs2_rw_lock(inode, 1);
2499  if (ret < 0)
2500  mlog_errno(ret);
2501  else {
2502  ret = ocfs2_splice_to_file(pipe, out, &sd);
2503  ocfs2_rw_unlock(inode, 1);
2504  }
2505  mutex_unlock(&inode->i_mutex);
2506  } while (ret > 0);
2507  splice_from_pipe_end(pipe, &sd);
2508 
2509  if (pipe->inode)
2510  mutex_unlock(&pipe->inode->i_mutex);
2511 
2512  if (sd.num_spliced)
2513  ret = sd.num_spliced;
2514 
2515  if (ret > 0) {
2516  unsigned long nr_pages;
2517  int err;
2518 
2519  nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2520 
2521  err = generic_write_sync(out, *ppos, ret);
2522  if (err)
2523  ret = err;
2524  else
2525  *ppos += ret;
2526 
2527  balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2528  }
2529 
2530  return ret;
2531 }
2532 
2533 static ssize_t ocfs2_file_splice_read(struct file *in,
2534  loff_t *ppos,
2535  struct pipe_inode_info *pipe,
2536  size_t len,
2537  unsigned int flags)
2538 {
2539  int ret = 0, lock_level = 0;
2540  struct inode *inode = in->f_path.dentry->d_inode;
2541 
2542  trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2543  (unsigned long long)OCFS2_I(inode)->ip_blkno,
2544  in->f_path.dentry->d_name.len,
2545  in->f_path.dentry->d_name.name, len);
2546 
2547  /*
2548  * See the comment in ocfs2_file_aio_read()
2549  */
2550  ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2551  if (ret < 0) {
2552  mlog_errno(ret);
2553  goto bail;
2554  }
2555  ocfs2_inode_unlock(inode, lock_level);
2556 
2557  ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2558 
2559 bail:
2560  return ret;
2561 }
2562 
2563 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2564  const struct iovec *iov,
2565  unsigned long nr_segs,
2566  loff_t pos)
2567 {
2568  int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2569  struct file *filp = iocb->ki_filp;
2570  struct inode *inode = filp->f_path.dentry->d_inode;
2571 
2572  trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2573  (unsigned long long)OCFS2_I(inode)->ip_blkno,
2574  filp->f_path.dentry->d_name.len,
2575  filp->f_path.dentry->d_name.name, nr_segs);
2576 
2577 
2578  if (!inode) {
2579  ret = -EINVAL;
2580  mlog_errno(ret);
2581  goto bail;
2582  }
2583 
2584  ocfs2_iocb_clear_sem_locked(iocb);
2585 
2586  /*
2587  * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2588  * need locks to protect pending reads from racing with truncate.
2589  */
2590  if (filp->f_flags & O_DIRECT) {
2591  have_alloc_sem = 1;
2592  ocfs2_iocb_set_sem_locked(iocb);
2593 
2594  ret = ocfs2_rw_lock(inode, 0);
2595  if (ret < 0) {
2596  mlog_errno(ret);
2597  goto bail;
2598  }
2599  rw_level = 0;
2600  /* communicate with ocfs2_dio_end_io */
2601  ocfs2_iocb_set_rw_locked(iocb, rw_level);
2602  }
2603 
2604  /*
2605  * We're fine letting folks race truncates and extending
2606  * writes with read across the cluster, just like they can
2607  * locally. Hence no rw_lock during read.
2608  *
2609  * Take and drop the meta data lock to update inode fields
2610  * like i_size. This gives the checks down in
2611  * generic_file_aio_read() a chance of actually working.
2612  */
2613  ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2614  if (ret < 0) {
2615  mlog_errno(ret);
2616  goto bail;
2617  }
2618  ocfs2_inode_unlock(inode, lock_level);
2619 
2620  ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2621  trace_generic_file_aio_read_ret(ret);
2622 
2623  /* buffered aio wouldn't have proper lock coverage today */
2624  BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2625 
2626  /* see ocfs2_file_aio_write */
2627  if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2628  rw_level = -1;
2629  have_alloc_sem = 0;
2630  }
2631 
2632 bail:
2633  if (have_alloc_sem)
2634  ocfs2_iocb_clear_sem_locked(iocb);
2635 
2636  if (rw_level != -1)
2637  ocfs2_rw_unlock(inode, rw_level);
2638 
2639  return ret;
2640 }
2641 
2642 /* Refer generic_file_llseek_unlocked() */
2643 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2644 {
2645  struct inode *inode = file->f_mapping->host;
2646  int ret = 0;
2647 
2648  mutex_lock(&inode->i_mutex);
2649 
2650  switch (origin) {
2651  case SEEK_SET:
2652  break;
2653  case SEEK_END:
2654  offset += inode->i_size;
2655  break;
2656  case SEEK_CUR:
2657  if (offset == 0) {
2658  offset = file->f_pos;
2659  goto out;
2660  }
2661  offset += file->f_pos;
2662  break;
2663  case SEEK_DATA:
2664  case SEEK_HOLE:
2665  ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
2666  if (ret)
2667  goto out;
2668  break;
2669  default:
2670  ret = -EINVAL;
2671  goto out;
2672  }
2673 
2674  if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2675  ret = -EINVAL;
2676  if (!ret && offset > inode->i_sb->s_maxbytes)
2677  ret = -EINVAL;
2678  if (ret)
2679  goto out;
2680 
2681  if (offset != file->f_pos) {
2682  file->f_pos = offset;
2683  file->f_version = 0;
2684  }
2685 
2686 out:
2687  mutex_unlock(&inode->i_mutex);
2688  if (ret)
2689  return ret;
2690  return offset;
2691 }
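/*
 * Editor's note: the SEEK_DATA/SEEK_HOLE cases above from the caller's
 * side. On a sparse ocfs2 file, lseek(2) jumps over holes instead of
 * scanning them; fd is assumed open on such a file. Sketch only.
 */
#define _GNU_SOURCE
#include <unistd.h>

static off_t next_data(int fd, off_t from)
{
	/* returns -1 with errno == ENXIO when no data remains past 'from' */
	return lseek(fd, from, SEEK_DATA);
}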
2692 
2693 const struct inode_operations ocfs2_file_iops = {
2694  .setattr = ocfs2_setattr,
2695  .getattr = ocfs2_getattr,
2696  .permission = ocfs2_permission,
2697  .setxattr = generic_setxattr,
2698  .getxattr = generic_getxattr,
2699  .listxattr = ocfs2_listxattr,
2700  .removexattr = generic_removexattr,
2701  .fiemap = ocfs2_fiemap,
2702  .get_acl = ocfs2_iop_get_acl,
2703 };
2704 
2705 const struct inode_operations ocfs2_special_file_iops = {
2706  .setattr = ocfs2_setattr,
2707  .getattr = ocfs2_getattr,
2708  .permission = ocfs2_permission,
2709  .get_acl = ocfs2_iop_get_acl,
2710 };
2711 
2712 /*
2713  * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2714  * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2715  */
2716 const struct file_operations ocfs2_fops = {
2717  .llseek = ocfs2_file_llseek,
2718  .read = do_sync_read,
2719  .write = do_sync_write,
2720  .mmap = ocfs2_mmap,
2721  .fsync = ocfs2_sync_file,
2722  .release = ocfs2_file_release,
2723  .open = ocfs2_file_open,
2724  .aio_read = ocfs2_file_aio_read,
2725  .aio_write = ocfs2_file_aio_write,
2726  .unlocked_ioctl = ocfs2_ioctl,
2727 #ifdef CONFIG_COMPAT
2728  .compat_ioctl = ocfs2_compat_ioctl,
2729 #endif
2730  .lock = ocfs2_lock,
2731  .flock = ocfs2_flock,
2732  .splice_read = ocfs2_file_splice_read,
2733  .splice_write = ocfs2_file_splice_write,
2734  .fallocate = ocfs2_fallocate,
2735 };
2736 
2737 const struct file_operations ocfs2_dops = {
2738  .llseek = generic_file_llseek,
2739  .read = generic_read_dir,
2740  .readdir = ocfs2_readdir,
2741  .fsync = ocfs2_sync_file,
2742  .release = ocfs2_dir_release,
2743  .open = ocfs2_dir_open,
2744  .unlocked_ioctl = ocfs2_ioctl,
2745 #ifdef CONFIG_COMPAT
2746  .compat_ioctl = ocfs2_compat_ioctl,
2747 #endif
2748  .lock = ocfs2_lock,
2749  .flock = ocfs2_flock,
2750 };
2751 
2752 /*
2753  * POSIX-lockless variants of our file_operations.
2754  *
2755  * These will be used if the underlying cluster stack does not support
2756  * posix file locking, if the user passes the "localflocks" mount
2757  * option, or if we have a local-only fs.
2758  *
2759  * ocfs2_flock is in here because all stacks handle UNIX file locks,
2760  * so we still want it in the case of no stack support for
2761  * plocks. Internally, it will do the right thing when asked to ignore
2762  * the cluster.
2763  */
2764 const struct file_operations ocfs2_fops_no_plocks = {
2765  .llseek = ocfs2_file_llseek,
2766  .read = do_sync_read,
2767  .write = do_sync_write,
2768  .mmap = ocfs2_mmap,
2769  .fsync = ocfs2_sync_file,
2770  .release = ocfs2_file_release,
2771  .open = ocfs2_file_open,
2772  .aio_read = ocfs2_file_aio_read,
2773  .aio_write = ocfs2_file_aio_write,
2774  .unlocked_ioctl = ocfs2_ioctl,
2775 #ifdef CONFIG_COMPAT
2776  .compat_ioctl = ocfs2_compat_ioctl,
2777 #endif
2778  .flock = ocfs2_flock,
2779  .splice_read = ocfs2_file_splice_read,
2780  .splice_write = ocfs2_file_splice_write,
2781  .fallocate = ocfs2_fallocate,
2782 };
2783 
2784 const struct file_operations ocfs2_dops_no_plocks = {
2785  .llseek = generic_file_llseek,
2786  .read = generic_read_dir,
2787  .readdir = ocfs2_readdir,
2788  .fsync = ocfs2_sync_file,
2789  .release = ocfs2_dir_release,
2790  .open = ocfs2_dir_open,
2791  .unlocked_ioctl = ocfs2_ioctl,
2792 #ifdef CONFIG_COMPAT
2793  .compat_ioctl = ocfs2_compat_ioctl,
2794 #endif
2795  .flock = ocfs2_flock,
2796 };