Linux Kernel 3.7.1
xfs_aops.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include "xfs.h"
19 #include "xfs_log.h"
20 #include "xfs_sb.h"
21 #include "xfs_ag.h"
22 #include "xfs_trans.h"
23 #include "xfs_mount.h"
24 #include "xfs_bmap_btree.h"
25 #include "xfs_dinode.h"
26 #include "xfs_inode.h"
27 #include "xfs_inode_item.h"
28 #include "xfs_alloc.h"
29 #include "xfs_error.h"
30 #include "xfs_iomap.h"
31 #include "xfs_vnodeops.h"
32 #include "xfs_trace.h"
33 #include "xfs_bmap.h"
34 #include <linux/gfp.h>
35 #include <linux/mpage.h>
36 #include <linux/pagevec.h>
37 #include <linux/writeback.h>
38 
39 void
40 xfs_count_page_state(
41  struct page *page,
42  int *delalloc,
43  int *unwritten)
44 {
45  struct buffer_head *bh, *head;
46 
47  *delalloc = *unwritten = 0;
48 
49  bh = head = page_buffers(page);
50  do {
51  if (buffer_unwritten(bh))
52  (*unwritten) = 1;
53  else if (buffer_delay(bh))
54  (*delalloc) = 1;
55  } while ((bh = bh->b_this_page) != head);
56 }
57 
58 STATIC struct block_device *
59 xfs_find_bdev_for_inode(
60  struct inode *inode)
61 {
62  struct xfs_inode *ip = XFS_I(inode);
63  struct xfs_mount *mp = ip->i_mount;
64 
65  if (XFS_IS_REALTIME_INODE(ip))
66  return mp->m_rtdev_targp->bt_bdev;
67  else
68  return mp->m_ddev_targp->bt_bdev;
69 }
70 
71 /*
72  * We're now finished for good with this ioend structure.
73  * Update the page state via the associated buffer_heads,
74  * release holds on the inode and bio, and finally free
75  * up memory. Do not use the ioend after this.
76  */
77 STATIC void
78 xfs_destroy_ioend(
79  xfs_ioend_t *ioend)
80 {
81  struct buffer_head *bh, *next;
82 
83  for (bh = ioend->io_buffer_head; bh; bh = next) {
84  next = bh->b_private;
85  bh->b_end_io(bh, !ioend->io_error);
86  }
87 
88  if (ioend->io_iocb) {
89  if (ioend->io_isasync) {
90  aio_complete(ioend->io_iocb, ioend->io_error ?
91  ioend->io_error : ioend->io_result, 0);
92  }
93  inode_dio_done(ioend->io_inode);
94  }
95 
96  mempool_free(ioend, xfs_ioend_pool);
97 }
98 
99 /*
100  * Fast and loose check if this write could update the on-disk inode size.
101  */
102 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
103 {
104  return ioend->io_offset + ioend->io_size >
105  XFS_I(ioend->io_inode)->i_d.di_size;
106 }
107 
108 STATIC int
109 xfs_setfilesize_trans_alloc(
110  struct xfs_ioend *ioend)
111 {
112  struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
113  struct xfs_trans *tp;
114  int error;
115 
117 
116  tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
117 
118  error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
119  if (error) {
120  xfs_trans_cancel(tp, 0);
121  return error;
122  }
123 
124  ioend->io_append_trans = tp;
125 
126  /*
127  * We will pass freeze protection with a transaction. So tell lockdep
128  * we released it.
129  */
130  rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
131  1, _THIS_IP_);
132  /*
133  * We hand off the transaction to the completion thread now, so
134  * clear the flag here.
135  */
136  current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
137  return 0;
138 }
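/*
 * Note on the freeze protection handoff above: the submitting context drops
 * its lockdep record of SB_FREEZE_FS protection here (rwsem_release), and
 * the completion side re-acquires it in xfs_end_io() via rwsem_acquire_read()
 * before the transaction is committed or cancelled, so lockdep sees the
 * protection as transferred rather than leaked.
 */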
139 
140 /*
141  * Update on-disk file size now that data has been written to disk.
142  */
143 STATIC int
144 xfs_setfilesize(
145  struct xfs_ioend *ioend)
146 {
147  struct xfs_inode *ip = XFS_I(ioend->io_inode);
148  struct xfs_trans *tp = ioend->io_append_trans;
149  xfs_fsize_t isize;
150 
151  /*
152  * The transaction was allocated in the I/O submission thread,
153  * thus we need to mark ourselves as being in a transaction
154  * manually.
155  */
156  current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 
158  xfs_ilock(ip, XFS_ILOCK_EXCL);
159  isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
160  if (!isize) {
161  xfs_iunlock(ip, XFS_ILOCK_EXCL);
162  xfs_trans_cancel(tp, 0);
163  return 0;
164  }
165 
166  trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
167 
168  ip->i_d.di_size = isize;
169  xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
170  xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
171 
172  return xfs_trans_commit(tp, 0);
173 }
174 
175 /*
176  * Schedule IO completion handling on the final put of an ioend.
177  *
178  * If there is no work to do we might as well call it a day and free the
179  * ioend right now.
180  */
181 STATIC void
182 xfs_finish_ioend(
183  struct xfs_ioend *ioend)
184 {
185  if (atomic_dec_and_test(&ioend->io_remaining)) {
186  struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
187 
188  if (ioend->io_type == XFS_IO_UNWRITTEN)
189  queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190  else if (ioend->io_append_trans)
191  queue_work(mp->m_data_workqueue, &ioend->io_work);
192  else
193  xfs_destroy_ioend(ioend);
194  }
195 }
196 
197 /*
198  * IO write completion.
199  */
200 STATIC void
201 xfs_end_io(
202  struct work_struct *work)
203 {
204  xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
205  struct xfs_inode *ip = XFS_I(ioend->io_inode);
206  int error = 0;
207 
208  if (ioend->io_append_trans) {
209  /*
210  * We've got freeze protection passed with the transaction.
211  * Tell lockdep about it.
212  */
213  rwsem_acquire_read(
214  &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215  0, 1, _THIS_IP_);
216  }
217  if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218  ioend->io_error = -EIO;
219  goto done;
220  }
221  if (ioend->io_error)
222  goto done;
223 
224  /*
225  * For unwritten extents we need to issue transactions to convert a
226  * range to normal written extents after the data I/O has finished.
227  */
228  if (ioend->io_type == XFS_IO_UNWRITTEN) {
229  /*
230  * For buffered I/O we never preallocate a transaction when
231  * doing the unwritten extent conversion, but for direct I/O
232  * we do not know if we are converting an unwritten extent
233  * or not at the point where we preallocate the transaction.
234  */
235  if (ioend->io_append_trans) {
236  ASSERT(ioend->io_isdirect);
237 
238  current_set_flags_nested(
239  &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240  xfs_trans_cancel(ioend->io_append_trans, 0);
241  }
242 
243  error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244  ioend->io_size);
245  if (error) {
246  ioend->io_error = -error;
247  goto done;
248  }
249  } else if (ioend->io_append_trans) {
250  error = xfs_setfilesize(ioend);
251  if (error)
252  ioend->io_error = -error;
253  } else {
254  ASSERT(!xfs_ioend_is_append(ioend));
255  }
256 
257 done:
258  xfs_destroy_ioend(ioend);
259 }
260 
261 /*
262  * Call IO completion handling in caller context on the final put of an ioend.
263  */
264 STATIC void
265 xfs_finish_ioend_sync(
266  struct xfs_ioend *ioend)
267 {
268  if (atomic_dec_and_test(&ioend->io_remaining))
269  xfs_end_io(&ioend->io_work);
270 }
271 
272 /*
273  * Allocate and initialise an IO completion structure.
274  * We need to track unwritten extent write completion here initially.
275  * We'll need to extend this for updating the ondisk inode size later
276  * (vs. incore size).
277  */
278 STATIC xfs_ioend_t *
279 xfs_alloc_ioend(
280  struct inode *inode,
281  unsigned int type)
282 {
283  xfs_ioend_t *ioend;
284 
285  ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
286 
287  /*
288  * Set the count to 1 initially, which will prevent an I/O
289  * completion callback that happens before we have started
290  * all the I/O from calling the completion routine too early.
291  */
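/*
 * Put differently: this initial reference belongs to the submission path.
 * xfs_submit_ioend_bio() takes an extra reference for every bio it issues,
 * and xfs_finish_ioend() drops one per bio completion plus the final put
 * from the submitter, so the completion work only runs after the last bio
 * has completed and submission itself has finished.
 */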
292  atomic_set(&ioend->io_remaining, 1);
293  ioend->io_isasync = 0;
294  ioend->io_isdirect = 0;
295  ioend->io_error = 0;
296  ioend->io_list = NULL;
297  ioend->io_type = type;
298  ioend->io_inode = inode;
299  ioend->io_buffer_head = NULL;
300  ioend->io_buffer_tail = NULL;
301  ioend->io_offset = 0;
302  ioend->io_size = 0;
303  ioend->io_iocb = NULL;
304  ioend->io_result = 0;
305  ioend->io_append_trans = NULL;
306 
307  INIT_WORK(&ioend->io_work, xfs_end_io);
308  return ioend;
309 }
310 
311 STATIC int
312 xfs_map_blocks(
313  struct inode *inode,
314  loff_t offset,
315  struct xfs_bmbt_irec *imap,
316  int type,
317  int nonblocking)
318 {
319  struct xfs_inode *ip = XFS_I(inode);
320  struct xfs_mount *mp = ip->i_mount;
321  ssize_t count = 1 << inode->i_blkbits;
322  xfs_fileoff_t offset_fsb, end_fsb;
323  int error = 0;
324  int bmapi_flags = XFS_BMAPI_ENTIRE;
325  int nimaps = 1;
326 
327  if (XFS_FORCED_SHUTDOWN(mp))
328  return -XFS_ERROR(EIO);
329 
330  if (type == XFS_IO_UNWRITTEN)
331  bmapi_flags |= XFS_BMAPI_IGSTATE;
332 
333  if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
334  if (nonblocking)
335  return -XFS_ERROR(EAGAIN);
336  xfs_ilock(ip, XFS_ILOCK_SHARED);
337  }
338 
339  ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
340  (ip->i_df.if_flags & XFS_IFEXTENTS));
341  ASSERT(offset <= mp->m_super->s_maxbytes);
342 
343  if (offset + count > mp->m_super->s_maxbytes)
344  count = mp->m_super->s_maxbytes - offset;
345  end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
346  offset_fsb = XFS_B_TO_FSBT(mp, offset);
347  error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
348  imap, &nimaps, bmapi_flags);
349  xfs_iunlock(ip, XFS_ILOCK_SHARED);
350 
351  if (error)
352  return -XFS_ERROR(error);
353 
354  if (type == XFS_IO_DELALLOC &&
355  (!nimaps || isnullstartblock(imap->br_startblock))) {
356  error = xfs_iomap_write_allocate(ip, offset, count, imap);
357  if (!error)
358  trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
359  return -XFS_ERROR(error);
360  }
361 
362 #ifdef DEBUG
363  if (type == XFS_IO_UNWRITTEN) {
364  ASSERT(nimaps);
365  ASSERT(imap->br_startblock != HOLESTARTBLOCK);
366  ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
367  }
368 #endif
369  if (nimaps)
370  trace_xfs_map_blocks_found(ip, offset, count, type, imap);
371  return 0;
372 }
373 
374 STATIC int
375 xfs_imap_valid(
376  struct inode *inode,
377  struct xfs_bmbt_irec *imap,
378  xfs_off_t offset)
379 {
380  offset >>= inode->i_blkbits;
381 
382  return offset >= imap->br_startoff &&
383  offset < imap->br_startoff + imap->br_blockcount;
384 }
385 
386 /*
387  * BIO completion handler for buffered IO.
388  */
389 STATIC void
390 xfs_end_bio(
391  struct bio *bio,
392  int error)
393 {
394  xfs_ioend_t *ioend = bio->bi_private;
395 
396  ASSERT(atomic_read(&bio->bi_cnt) >= 1);
397  ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
398 
399  /* Toss bio and pass work off to an xfsdatad thread */
400  bio->bi_private = NULL;
401  bio->bi_end_io = NULL;
402  bio_put(bio);
403 
404  xfs_finish_ioend(ioend);
405 }
406 
407 STATIC void
408 xfs_submit_ioend_bio(
409  struct writeback_control *wbc,
410  xfs_ioend_t *ioend,
411  struct bio *bio)
412 {
413  atomic_inc(&ioend->io_remaining);
414  bio->bi_private = ioend;
415  bio->bi_end_io = xfs_end_bio;
416  submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
417 }
418 
419 STATIC struct bio *
420 xfs_alloc_ioend_bio(
421  struct buffer_head *bh)
422 {
423  int nvecs = bio_get_nr_vecs(bh->b_bdev);
424  struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
425 
426  ASSERT(bio->bi_private == NULL);
427  bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
428  bio->bi_bdev = bh->b_bdev;
429  return bio;
430 }
431 
432 STATIC void
433 xfs_start_buffer_writeback(
434  struct buffer_head *bh)
435 {
436  ASSERT(buffer_mapped(bh));
437  ASSERT(buffer_locked(bh));
438  ASSERT(!buffer_delay(bh));
439  ASSERT(!buffer_unwritten(bh));
440 
441  mark_buffer_async_write(bh);
442  set_buffer_uptodate(bh);
443  clear_buffer_dirty(bh);
444 }
445 
446 STATIC void
447 xfs_start_page_writeback(
448  struct page *page,
449  int clear_dirty,
450  int buffers)
451 {
452  ASSERT(PageLocked(page));
453  ASSERT(!PageWriteback(page));
454  if (clear_dirty)
455  clear_page_dirty_for_io(page);
456  set_page_writeback(page);
457  unlock_page(page);
458  /* If no buffers on the page are to be written, finish it here */
459  if (!buffers)
460  end_page_writeback(page);
461 }
462 
463 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
464 {
465  return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
466 }
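/*
 * bio_add_page() returns the number of bytes actually added, so a return
 * value other than bh->b_size means the current bio is full (or hit a
 * device boundary); xfs_submit_ioend() then submits that bio and retries
 * the buffer with a newly allocated one.
 */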
467 
468 /*
469  * Submit all of the bios for all of the ioends we have saved up, covering the
470  * initial writepage page and also any probed pages.
471  *
472  * Because we may have multiple ioends spanning a page, we need to start
473  * writeback on all the buffers before we submit them for I/O. If we mark the
474  * buffers as we got, then we can end up with a page that only has buffers
475  * buffers as we go, then we can end up with a page that only has buffers
476  * marked async write, and I/O completion on one of them can occur before we mark the other
477  *
478  * The end result of this is that we trip a bug in end_page_writeback() because
479  * we call it twice for the one page as the code in end_buffer_async_write()
480  * assumes that all buffers on the page are started at the same time.
481  *
482  * The fix is two passes across the ioend list - one to start writeback on the
483  * buffer_heads, and then submit them for I/O on the second pass.
484  *
485  * If @fail is non-zero, it means that we have a situation where some part of
486  * the submission process has failed after we have marked pages for writeback
487  * and unlocked them. In this situation, we need to fail the ioend chain rather
488  * than submit it to IO. This typically only happens on a filesystem shutdown.
489  */
490 STATIC void
491 xfs_submit_ioend(
492  struct writeback_control *wbc,
493  xfs_ioend_t *ioend,
494  int fail)
495 {
496  xfs_ioend_t *head = ioend;
497  xfs_ioend_t *next;
498  struct buffer_head *bh;
499  struct bio *bio;
500  sector_t lastblock = 0;
501 
502  /* Pass 1 - start writeback */
503  do {
504  next = ioend->io_list;
505  for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
506  xfs_start_buffer_writeback(bh);
507  } while ((ioend = next) != NULL);
508 
509  /* Pass 2 - submit I/O */
510  ioend = head;
511  do {
512  next = ioend->io_list;
513  bio = NULL;
514 
515  /*
516  * If we are failing the IO now, just mark the ioend with an
517  * error and finish it. This will run IO completion immediately
518  * as there is only one reference to the ioend at this point in
519  * time.
520  */
521  if (fail) {
522  ioend->io_error = -fail;
523  xfs_finish_ioend(ioend);
524  continue;
525  }
526 
527  for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
528 
529  if (!bio) {
530  retry:
531  bio = xfs_alloc_ioend_bio(bh);
532  } else if (bh->b_blocknr != lastblock + 1) {
533  xfs_submit_ioend_bio(wbc, ioend, bio);
534  goto retry;
535  }
536 
537  if (bio_add_buffer(bio, bh) != bh->b_size) {
538  xfs_submit_ioend_bio(wbc, ioend, bio);
539  goto retry;
540  }
541 
542  lastblock = bh->b_blocknr;
543  }
544  if (bio)
545  xfs_submit_ioend_bio(wbc, ioend, bio);
546  xfs_finish_ioend(ioend);
547  } while ((ioend = next) != NULL);
548 }
549 
550 /*
551  * Cancel submission of all buffer_heads so far in this endio.
552  * Toss the endio too. Only ever called for the initial page
553  * in a writepage request, so only ever one page.
554  */
555 STATIC void
556 xfs_cancel_ioend(
557  xfs_ioend_t *ioend)
558 {
559  xfs_ioend_t *next;
560  struct buffer_head *bh, *next_bh;
561 
562  do {
563  next = ioend->io_list;
564  bh = ioend->io_buffer_head;
565  do {
566  next_bh = bh->b_private;
567  clear_buffer_async_write(bh);
568  unlock_buffer(bh);
569  } while ((bh = next_bh) != NULL);
570 
571  mempool_free(ioend, xfs_ioend_pool);
572  } while ((ioend = next) != NULL);
573 }
574 
575 /*
576  * Test to see if we've been building up a completion structure for
577  * earlier buffers -- if so, we try to append to this ioend if we
578  * can, otherwise we finish off any current ioend and start another.
579  * Return true if we've finished the given ioend.
580  * The ioend being built up is returned to the caller via *result.
581 STATIC void
582 xfs_add_to_ioend(
583  struct inode *inode,
584  struct buffer_head *bh,
585  xfs_off_t offset,
586  unsigned int type,
587  xfs_ioend_t **result,
588  int need_ioend)
589 {
590  xfs_ioend_t *ioend = *result;
591 
592  if (!ioend || need_ioend || type != ioend->io_type) {
593  xfs_ioend_t *previous = *result;
594 
595  ioend = xfs_alloc_ioend(inode, type);
596  ioend->io_offset = offset;
597  ioend->io_buffer_head = bh;
598  ioend->io_buffer_tail = bh;
599  if (previous)
600  previous->io_list = ioend;
601  *result = ioend;
602  } else {
603  ioend->io_buffer_tail->b_private = bh;
604  ioend->io_buffer_tail = bh;
605  }
606 
607  bh->b_private = NULL;
608  ioend->io_size += bh->b_size;
609 }
610 
611 STATIC void
612 xfs_map_buffer(
613  struct inode *inode,
614  struct buffer_head *bh,
615  struct xfs_bmbt_irec *imap,
616  xfs_off_t offset)
617 {
618  sector_t bn;
619  struct xfs_mount *m = XFS_I(inode)->i_mount;
620  xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
621  xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
622 
623  ASSERT(imap->br_startblock != HOLESTARTBLOCK);
624  ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
625 
626  bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
627  ((offset - iomap_offset) >> inode->i_blkbits);
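/*
 * iomap_bn is in 512-byte basic blocks (BBSHIFT == 9) while b_blocknr is in
 * filesystem blocks, hence the shift by (i_blkbits - BBSHIFT). As an
 * illustration, with 4096-byte blocks (i_blkbits == 12) an extent starting
 * at daddr 800 maps to filesystem block 800 >> 3 == 100, and a buffer two
 * blocks into the extent ends up with b_blocknr == 102.
 */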
628 
629  ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
630 
631  bh->b_blocknr = bn;
632  set_buffer_mapped(bh);
633 }
634 
635 STATIC void
636 xfs_map_at_offset(
637  struct inode *inode,
638  struct buffer_head *bh,
639  struct xfs_bmbt_irec *imap,
640  xfs_off_t offset)
641 {
642  ASSERT(imap->br_startblock != HOLESTARTBLOCK);
643  ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
644 
645  xfs_map_buffer(inode, bh, imap, offset);
646  set_buffer_mapped(bh);
647  clear_buffer_delay(bh);
648  clear_buffer_unwritten(bh);
649 }
650 
651 /*
652  * Test if a given page is suitable for writing as part of an unwritten
653  * or delayed allocate extent.
654  */
655 STATIC int
656 xfs_check_page_type(
657  struct page *page,
658  unsigned int type)
659 {
660  if (PageWriteback(page))
661  return 0;
662 
663  if (page->mapping && page_has_buffers(page)) {
664  struct buffer_head *bh, *head;
665  int acceptable = 0;
666 
667  bh = head = page_buffers(page);
668  do {
669  if (buffer_unwritten(bh))
670  acceptable += (type == XFS_IO_UNWRITTEN);
671  else if (buffer_delay(bh))
672  acceptable += (type == XFS_IO_DELALLOC);
673  else if (buffer_dirty(bh) && buffer_mapped(bh))
674  acceptable += (type == XFS_IO_OVERWRITE);
675  else
676  break;
677  } while ((bh = bh->b_this_page) != head);
678 
679  if (acceptable)
680  return 1;
681  }
682 
683  return 0;
684 }
685 
686 /*
687  * Allocate & map buffers for page given the extent map. Write it out.
688  * Except for the original page of a writepage, this is called on
689  * delalloc/unwritten pages only; for the original page it is possible
690  * that the page has no mapping at all.
691  */
692 STATIC int
693 xfs_convert_page(
694  struct inode *inode,
695  struct page *page,
696  loff_t tindex,
697  struct xfs_bmbt_irec *imap,
698  xfs_ioend_t **ioendp,
699  struct writeback_control *wbc)
700 {
701  struct buffer_head *bh, *head;
702  xfs_off_t end_offset;
703  unsigned long p_offset;
704  unsigned int type;
705  int len, page_dirty;
706  int count = 0, done = 0, uptodate = 1;
707  xfs_off_t offset = page_offset(page);
708 
709  if (page->index != tindex)
710  goto fail;
711  if (!trylock_page(page))
712  goto fail;
713  if (PageWriteback(page))
714  goto fail_unlock_page;
715  if (page->mapping != inode->i_mapping)
716  goto fail_unlock_page;
717  if (!xfs_check_page_type(page, (*ioendp)->io_type))
718  goto fail_unlock_page;
719 
720  /*
721  * page_dirty is initially a count of buffers on the page before
722  * EOF and is decremented as we move each into a cleanable state.
723  *
724  * Derivation:
725  *
726  * End offset is the highest offset that this page should represent.
727  * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
728  * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
729  * hence give us the correct page_dirty count. On any other page,
730  * it will be zero and in that case we need page_dirty to be the
731  * count of buffers on the page.
732  */
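/*
 * For illustration, assuming 4096-byte pages and 1024-byte blocks: on the
 * last page of a file whose size ends 2600 bytes into the page, p_offset
 * becomes roundup(2600, 1024) == 3072 and page_dirty == 3; on a page wholly
 * below EOF the masked offset is 0, so p_offset is forced to PAGE_CACHE_SIZE
 * and page_dirty == 4.
 */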
733  end_offset = min_t(unsigned long long,
734  (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
735  i_size_read(inode));
736 
737  len = 1 << inode->i_blkbits;
738  p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
739  PAGE_CACHE_SIZE);
740  p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
741  page_dirty = p_offset / len;
742 
743  bh = head = page_buffers(page);
744  do {
745  if (offset >= end_offset)
746  break;
747  if (!buffer_uptodate(bh))
748  uptodate = 0;
749  if (!(PageUptodate(page) || buffer_uptodate(bh))) {
750  done = 1;
751  continue;
752  }
753 
754  if (buffer_unwritten(bh) || buffer_delay(bh) ||
755  buffer_mapped(bh)) {
756  if (buffer_unwritten(bh))
757  type = XFS_IO_UNWRITTEN;
758  else if (buffer_delay(bh))
759  type = XFS_IO_DELALLOC;
760  else
761  type = XFS_IO_OVERWRITE;
762 
763  if (!xfs_imap_valid(inode, imap, offset)) {
764  done = 1;
765  continue;
766  }
767 
768  lock_buffer(bh);
769  if (type != XFS_IO_OVERWRITE)
770  xfs_map_at_offset(inode, bh, imap, offset);
771  xfs_add_to_ioend(inode, bh, offset, type,
772  ioendp, done);
773 
774  page_dirty--;
775  count++;
776  } else {
777  done = 1;
778  }
779  } while (offset += len, (bh = bh->b_this_page) != head);
780 
781  if (uptodate && bh == head)
782  SetPageUptodate(page);
783 
784  if (count) {
785  if (--wbc->nr_to_write <= 0 &&
786  wbc->sync_mode == WB_SYNC_NONE)
787  done = 1;
788  }
789  xfs_start_page_writeback(page, !page_dirty, count);
790 
791  return done;
792  fail_unlock_page:
793  unlock_page(page);
794  fail:
795  return 1;
796 }
797 
798 /*
799  * Convert & write out a cluster of pages in the same extent as defined
800  * by mp and following the start page.
801  */
802 STATIC void
803 xfs_cluster_write(
804  struct inode *inode,
805  pgoff_t tindex,
806  struct xfs_bmbt_irec *imap,
807  xfs_ioend_t **ioendp,
808  struct writeback_control *wbc,
809  pgoff_t tlast)
810 {
811  struct pagevec pvec;
812  int done = 0, i;
813 
814  pagevec_init(&pvec, 0);
815  while (!done && tindex <= tlast) {
816  unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
817 
818  if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
819  break;
820 
821  for (i = 0; i < pagevec_count(&pvec); i++) {
822  done = xfs_convert_page(inode, pvec.pages[i], tindex++,
823  imap, ioendp, wbc);
824  if (done)
825  break;
826  }
827 
828  pagevec_release(&pvec);
829  cond_resched();
830  }
831 }
832 
833 STATIC void
834 xfs_vm_invalidatepage(
835  struct page *page,
836  unsigned long offset)
837 {
838  trace_xfs_invalidatepage(page->mapping->host, page, offset);
839  block_invalidatepage(page, offset);
840 }
841 
842 /*
843  * If the page has delalloc buffers on it, we need to punch them out before we
844  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
845  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
846  * is done on that same region - the delalloc extent is returned when none is
847  * supposed to be there.
848  *
849  * We prevent this by truncating away the delalloc regions on the page before
850  * invalidating it. Because they are delalloc, we can do this without needing a
851  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
852  * truncation without a transaction as there is no space left for block
853  * reservation (typically why we see an ENOSPC in writeback).
854  *
855  * This is not a performance critical path, so for now just do the punching a
856  * buffer head at a time.
857  */
858 STATIC void
859 xfs_aops_discard_page(
860  struct page *page)
861 {
862  struct inode *inode = page->mapping->host;
863  struct xfs_inode *ip = XFS_I(inode);
864  struct buffer_head *bh, *head;
865  loff_t offset = page_offset(page);
866 
867  if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
868  goto out_invalidate;
869 
870  if (XFS_FORCED_SHUTDOWN(ip->i_mount))
871  goto out_invalidate;
872 
873  xfs_alert(ip->i_mount,
874  "page discard on page %p, inode 0x%llx, offset %llu.",
875  page, ip->i_ino, offset);
876 
877  xfs_ilock(ip, XFS_ILOCK_EXCL);
878  bh = head = page_buffers(page);
879  do {
880  int error;
881  xfs_fileoff_t start_fsb;
882 
883  if (!buffer_delay(bh))
884  goto next_buffer;
885 
886  start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
887  error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
888  if (error) {
889  /* something screwed, just bail */
890  if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
891  xfs_alert(ip->i_mount,
892  "page discard unable to remove delalloc mapping.");
893  }
894  break;
895  }
896 next_buffer:
897  offset += 1 << inode->i_blkbits;
898 
899  } while ((bh = bh->b_this_page) != head);
900 
901  xfs_iunlock(ip, XFS_ILOCK_EXCL);
902 out_invalidate:
903  xfs_vm_invalidatepage(page, 0);
904  return;
905 }
906 
907 /*
908  * Write out a dirty page.
909  *
910  * For delalloc space on the page we need to allocate space and flush it.
911  * For unwritten space on the page we need to start the conversion to
912  * regular allocated space.
913  * For any other dirty buffer heads on the page we should flush them.
914  */
915 STATIC int
916 xfs_vm_writepage(
917  struct page *page,
918  struct writeback_control *wbc)
919 {
920  struct inode *inode = page->mapping->host;
921  struct buffer_head *bh, *head;
922  struct xfs_bmbt_irec imap;
923  xfs_ioend_t *ioend = NULL, *iohead = NULL;
924  loff_t offset;
925  unsigned int type;
926  __uint64_t end_offset;
927  pgoff_t end_index, last_index;
928  ssize_t len;
929  int err, imap_valid = 0, uptodate = 1;
930  int count = 0;
931  int nonblocking = 0;
932 
933  trace_xfs_writepage(inode, page, 0);
934 
935  ASSERT(page_has_buffers(page));
936 
937  /*
938  * Refuse to write the page out if we are called from reclaim context.
939  *
940  * This avoids stack overflows when called from deeply used stacks in
941  * random callers for direct reclaim or memcg reclaim. We explicitly
942  * allow reclaim from kswapd as the stack usage there is relatively low.
943  *
944  * This should never happen except in the case of a VM regression so
945  * warn about it.
946  */
947  if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
948  PF_MEMALLOC))
949  goto redirty;
950 
951  /*
952  * Given that we do not allow direct reclaim to call us, we should
953  * never be called while in a filesystem transaction.
954  */
955  if (WARN_ON(current->flags & PF_FSTRANS))
956  goto redirty;
957 
958  /* Is this page beyond the end of the file? */
959  offset = i_size_read(inode);
960  end_index = offset >> PAGE_CACHE_SHIFT;
961  last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
962  if (page->index >= end_index) {
963  unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
964 
965  /*
966  * Just skip the page if it is fully outside i_size, e.g. due
967  * to a truncate operation that is in progress.
968  */
969  if (page->index >= end_index + 1 || offset_into_page == 0) {
970  unlock_page(page);
971  return 0;
972  }
973 
974  /*
975  * The page straddles i_size. It must be zeroed out on each
976  * and every writepage invocation because it may be mmapped.
977  * "A file is mapped in multiples of the page size. For a file
978  * that is not a multiple of the page size, the remaining
979  * memory is zeroed when mapped, and writes to that region are
980  * not written out to the file."
981  */
982  zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
983  }
984 
985  end_offset = min_t(unsigned long long,
986  (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
987  offset);
988  len = 1 << inode->i_blkbits;
989 
990  bh = head = page_buffers(page);
991  offset = page_offset(page);
992  type = XFS_IO_OVERWRITE;
993 
994  if (wbc->sync_mode == WB_SYNC_NONE)
995  nonblocking = 1;
996 
997  do {
998  int new_ioend = 0;
999 
1000  if (offset >= end_offset)
1001  break;
1002  if (!buffer_uptodate(bh))
1003  uptodate = 0;
1004 
1005  /*
1006  * set_page_dirty dirties all buffers in a page, independent
1007  * of their state. The dirty state however is entirely
1008  * meaningless for holes (!mapped && uptodate), so skip
1009  * buffers covering holes here.
1010  */
1011  if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1012  imap_valid = 0;
1013  continue;
1014  }
1015 
1016  if (buffer_unwritten(bh)) {
1017  if (type != XFS_IO_UNWRITTEN) {
1018  type = XFS_IO_UNWRITTEN;
1019  imap_valid = 0;
1020  }
1021  } else if (buffer_delay(bh)) {
1022  if (type != XFS_IO_DELALLOC) {
1023  type = XFS_IO_DELALLOC;
1024  imap_valid = 0;
1025  }
1026  } else if (buffer_uptodate(bh)) {
1027  if (type != XFS_IO_OVERWRITE) {
1028  type = XFS_IO_OVERWRITE;
1029  imap_valid = 0;
1030  }
1031  } else {
1032  if (PageUptodate(page))
1033  ASSERT(buffer_mapped(bh));
1034  /*
1035  * This buffer is not uptodate and will not be
1036  * written to disk. Ensure that we will put any
1037  * subsequent writeable buffers into a new
1038  * ioend.
1039  */
1040  imap_valid = 0;
1041  continue;
1042  }
1043 
1044  if (imap_valid)
1045  imap_valid = xfs_imap_valid(inode, &imap, offset);
1046  if (!imap_valid) {
1047  /*
1048  * If we didn't have a valid mapping then we need to
1049  * put the new mapping into a separate ioend structure.
1050  * This ensures non-contiguous extents always have
1051  * separate ioends, which is particularly important
1052  * for unwritten extent conversion at I/O completion
1053  * time.
1054  */
1055  new_ioend = 1;
1056  err = xfs_map_blocks(inode, offset, &imap, type,
1057  nonblocking);
1058  if (err)
1059  goto error;
1060  imap_valid = xfs_imap_valid(inode, &imap, offset);
1061  }
1062  if (imap_valid) {
1063  lock_buffer(bh);
1064  if (type != XFS_IO_OVERWRITE)
1065  xfs_map_at_offset(inode, bh, &imap, offset);
1066  xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1067  new_ioend);
1068  count++;
1069  }
1070 
1071  if (!iohead)
1072  iohead = ioend;
1073 
1074  } while (offset += len, ((bh = bh->b_this_page) != head));
1075 
1076  if (uptodate && bh == head)
1077  SetPageUptodate(page);
1078 
1079  xfs_start_page_writeback(page, 1, count);
1080 
1081  /* if there is no IO to be submitted for this page, we are done */
1082  if (!ioend)
1083  return 0;
1084 
1085  ASSERT(iohead);
1086 
1087  /*
1088  * Any errors from this point onwards need to be reported through the IO
1089  * completion path as we have marked the initial page as under writeback
1090  * and unlocked it.
1091  */
1092  if (imap_valid) {
1093  xfs_off_t end_index;
1094 
1095  end_index = imap.br_startoff + imap.br_blockcount;
1096 
1097  /* to bytes */
1098  end_index <<= inode->i_blkbits;
1099 
1100  /* to pages */
1101  end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1102 
1103  /* check against file size */
1104  if (end_index > last_index)
1105  end_index = last_index;
1106 
1107  xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1108  wbc, end_index);
1109  }
1110 
1111 
1112  /*
1113  * Reserve log space if we might write beyond the on-disk inode size.
1114  */
1115  err = 0;
1116  if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1117  err = xfs_setfilesize_trans_alloc(ioend);
1118 
1119  xfs_submit_ioend(wbc, iohead, err);
1120 
1121  return 0;
1122 
1123 error:
1124  if (iohead)
1125  xfs_cancel_ioend(iohead);
1126 
1127  if (err == -EAGAIN)
1128  goto redirty;
1129 
1130  xfs_aops_discard_page(page);
1131  ClearPageUptodate(page);
1132  unlock_page(page);
1133  return err;
1134 
1135 redirty:
1136  redirty_page_for_writepage(wbc, page);
1137  unlock_page(page);
1138  return 0;
1139 }
1140 
1141 STATIC int
1142 xfs_vm_writepages(
1143  struct address_space *mapping,
1144  struct writeback_control *wbc)
1145 {
1146  xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1147  return generic_writepages(mapping, wbc);
1148 }
1149 
1150 /*
1151  * Called to move a page into cleanable state - and from there
1152  * to be released. The page should already be clean. We always
1153  * have buffer heads in this call.
1154  *
1155  * Returns 1 if the page is ok to release, 0 otherwise.
1156  */
1157 STATIC int
1158 xfs_vm_releasepage(
1159  struct page *page,
1160  gfp_t gfp_mask)
1161 {
1162  int delalloc, unwritten;
1163 
1164  trace_xfs_releasepage(page->mapping->host, page, 0);
1165 
1166  xfs_count_page_state(page, &delalloc, &unwritten);
1167 
1168  if (WARN_ON(delalloc))
1169  return 0;
1170  if (WARN_ON(unwritten))
1171  return 0;
1172 
1173  return try_to_free_buffers(page);
1174 }
1175 
1176 STATIC int
1177 __xfs_get_blocks(
1178  struct inode *inode,
1179  sector_t iblock,
1180  struct buffer_head *bh_result,
1181  int create,
1182  int direct)
1183 {
1184  struct xfs_inode *ip = XFS_I(inode);
1185  struct xfs_mount *mp = ip->i_mount;
1186  xfs_fileoff_t offset_fsb, end_fsb;
1187  int error = 0;
1188  int lockmode = 0;
1189  struct xfs_bmbt_irec imap;
1190  int nimaps = 1;
1191  xfs_off_t offset;
1192  ssize_t size;
1193  int new = 0;
1194 
1195  if (XFS_FORCED_SHUTDOWN(mp))
1196  return -XFS_ERROR(EIO);
1197 
1198  offset = (xfs_off_t)iblock << inode->i_blkbits;
1199  ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1200  size = bh_result->b_size;
1201 
1202  if (!create && direct && offset >= i_size_read(inode))
1203  return 0;
1204 
1205  /*
1206  * Direct I/O is usually done on preallocated files, so try getting
1207  * a block mapping without an exclusive lock first. For buffered
1208  * writes we already have the exclusive iolock anyway, so avoiding
1209  * a lock roundtrip here by taking the ilock exclusive from the
1210  * beginning is a useful micro optimization.
1211  */
1212  if (create && !direct) {
1213  lockmode = XFS_ILOCK_EXCL;
1214  xfs_ilock(ip, lockmode);
1215  } else {
1216  lockmode = xfs_ilock_map_shared(ip);
1217  }
1218 
1219  ASSERT(offset <= mp->m_super->s_maxbytes);
1220  if (offset + size > mp->m_super->s_maxbytes)
1221  size = mp->m_super->s_maxbytes - offset;
1222  end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1223  offset_fsb = XFS_B_TO_FSBT(mp, offset);
1224 
1225  error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1226  &imap, &nimaps, XFS_BMAPI_ENTIRE);
1227  if (error)
1228  goto out_unlock;
1229 
1230  if (create &&
1231  (!nimaps ||
1232  (imap.br_startblock == HOLESTARTBLOCK ||
1233  imap.br_startblock == DELAYSTARTBLOCK))) {
1234  if (direct || xfs_get_extsz_hint(ip)) {
1235  /*
1236  * Drop the ilock in preparation for starting the block
1237  * allocation transaction. It will be retaken
1238  * exclusively inside xfs_iomap_write_direct for the
1239  * actual allocation.
1240  */
1241  xfs_iunlock(ip, lockmode);
1242  error = xfs_iomap_write_direct(ip, offset, size,
1243  &imap, nimaps);
1244  if (error)
1245  return -error;
1246  new = 1;
1247  } else {
1248  /*
1249  * Delalloc reservations do not require a transaction,
1250  * we can go on without dropping the lock here. If we
1251  * are allocating a new delalloc block, make sure that
1252  * we set the new flag so that we know the buffer is
1253  * newly allocated and can be handled correctly if the
1254  * write fails.
1255  */
1256  if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1257  new = 1;
1258  error = xfs_iomap_write_delay(ip, offset, size, &imap);
1259  if (error)
1260  goto out_unlock;
1261 
1262  xfs_iunlock(ip, lockmode);
1263  }
1264 
1265  trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1266  } else if (nimaps) {
1267  trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1268  xfs_iunlock(ip, lockmode);
1269  } else {
1270  trace_xfs_get_blocks_notfound(ip, offset, size);
1271  goto out_unlock;
1272  }
1273 
1274  if (imap.br_startblock != HOLESTARTBLOCK &&
1275  imap.br_startblock != DELAYSTARTBLOCK) {
1276  /*
1277  * For unwritten extents do not report a disk address on
1278  * the read case (treat as if we're reading into a hole).
1279  */
1280  if (create || !ISUNWRITTEN(&imap))
1281  xfs_map_buffer(inode, bh_result, &imap, offset);
1282  if (create && ISUNWRITTEN(&imap)) {
1283  if (direct)
1284  bh_result->b_private = inode;
1285  set_buffer_unwritten(bh_result);
1286  }
1287  }
1288 
1289  /*
1290  * If this is a realtime file, data may be on a different device
1291  * to the one pointed to by the buffer_head's b_bdev currently.
1292  */
1293  bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1294 
1295  /*
1296  * If we previously allocated a block out beyond eof and we are now
1297  * coming back to use it then we will need to flag it as new even if it
1298  * has a disk address.
1299  *
1300  * With sub-block writes into unwritten extents we also need to mark
1301  * the buffer as new so that the unwritten parts of the buffer gets
1302  * correctly zeroed.
1303  */
1304  if (create &&
1305  ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1306  (offset >= i_size_read(inode)) ||
1307  (new || ISUNWRITTEN(&imap))))
1308  set_buffer_new(bh_result);
1309 
1310  if (imap.br_startblock == DELAYSTARTBLOCK) {
1311  BUG_ON(direct);
1312  if (create) {
1313  set_buffer_uptodate(bh_result);
1314  set_buffer_mapped(bh_result);
1315  set_buffer_delay(bh_result);
1316  }
1317  }
1318 
1319  /*
1320  * If this is O_DIRECT or the mpage code calling, tell them how large
1321  * the mapping is, so that we can avoid repeated get_blocks calls.
1322  */
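/*
 * For example (illustrative figures): with 4096-byte blocks, an extent with
 * br_startoff == 100 and br_blockcount == 50 queried at iblock == 120 has
 * 30 blocks left, so mapping_size starts at 30 << 12 bytes and is then
 * clamped to the requested size and to LONG_MAX below.
 */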
1323  if (direct || size > (1 << inode->i_blkbits)) {
1324  xfs_off_t mapping_size;
1325 
1326  mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1327  mapping_size <<= inode->i_blkbits;
1328 
1329  ASSERT(mapping_size > 0);
1330  if (mapping_size > size)
1331  mapping_size = size;
1332  if (mapping_size > LONG_MAX)
1333  mapping_size = LONG_MAX;
1334 
1335  bh_result->b_size = mapping_size;
1336  }
1337 
1338  return 0;
1339 
1340 out_unlock:
1341  xfs_iunlock(ip, lockmode);
1342  return -error;
1343 }
1344 
1345 int
1346 xfs_get_blocks(
1347  struct inode *inode,
1348  sector_t iblock,
1349  struct buffer_head *bh_result,
1350  int create)
1351 {
1352  return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1353 }
1354 
1355 STATIC int
1356 xfs_get_blocks_direct(
1357  struct inode *inode,
1358  sector_t iblock,
1359  struct buffer_head *bh_result,
1360  int create)
1361 {
1362  return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1363 }
1364 
1365 /*
1366  * Complete a direct I/O write request.
1367  *
1368  * If the private argument is non-NULL __xfs_get_blocks signals us that we
1369  * need to issue a transaction to convert the range from unwritten to written
1370  * extents. In case this is regular synchronous I/O we just call xfs_end_io
1371  * to do this and we are done. But in case this was a successful AIO
1372  * request this handler is called from interrupt context, from which we
1373  * can't start transactions. In that case offload the I/O completion to
1374  * the workqueues we also use for buffered I/O completion.
1375  */
1376 STATIC void
1377 xfs_end_io_direct_write(
1378  struct kiocb *iocb,
1379  loff_t offset,
1380  ssize_t size,
1381  void *private,
1382  int ret,
1383  bool is_async)
1384 {
1385  struct xfs_ioend *ioend = iocb->private;
1386 
1387  /*
1388  * While the generic direct I/O code updates the inode size, it does
1389  * so only after the end_io handler is called, which means our
1390  * end_io handler thinks the on-disk size is outside the in-core
1391  * size. To prevent this just update it a little bit earlier here.
1392  */
1393  if (offset + size > i_size_read(ioend->io_inode))
1394  i_size_write(ioend->io_inode, offset + size);
1395 
1396  /*
1397  * blockdev_direct_IO can return an error even after the I/O
1398  * completion handler was called. Thus we need to protect
1399  * against double-freeing.
1400  */
1401  iocb->private = NULL;
1402 
1403  ioend->io_offset = offset;
1404  ioend->io_size = size;
1405  ioend->io_iocb = iocb;
1406  ioend->io_result = ret;
1407  if (private && size > 0)
1408  ioend->io_type = XFS_IO_UNWRITTEN;
1409 
1410  if (is_async) {
1411  ioend->io_isasync = 1;
1412  xfs_finish_ioend(ioend);
1413  } else {
1414  xfs_finish_ioend_sync(ioend);
1415  }
1416 }
1417 
1418 STATIC ssize_t
1419 xfs_vm_direct_IO(
1420  int rw,
1421  struct kiocb *iocb,
1422  const struct iovec *iov,
1423  loff_t offset,
1424  unsigned long nr_segs)
1425 {
1426  struct inode *inode = iocb->ki_filp->f_mapping->host;
1427  struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1428  struct xfs_ioend *ioend = NULL;
1429  ssize_t ret;
1430 
1431  if (rw & WRITE) {
1432  size_t size = iov_length(iov, nr_segs);
1433 
1434  /*
1435  * We need to preallocate a transaction for a size update
1436  * here. In the case that this write both updates the size
1437  * and converts at least one unwritten extent we will cancel
1438  * the still clean transaction after the I/O has finished.
1439  */
1440  iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1441  if (offset + size > XFS_I(inode)->i_d.di_size) {
1442  ret = xfs_setfilesize_trans_alloc(ioend);
1443  if (ret)
1444  goto out_destroy_ioend;
1445  ioend->io_isdirect = 1;
1446  }
1447 
1448  ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1449  offset, nr_segs,
1450  xfs_get_blocks_direct,
1451  xfs_end_io_direct_write, NULL, 0);
1452  if (ret != -EIOCBQUEUED && iocb->private)
1453  goto out_trans_cancel;
1454  } else {
1455  ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1456  offset, nr_segs,
1457  xfs_get_blocks_direct,
1458  NULL, NULL, 0);
1459  }
1460 
1461  return ret;
1462 
1463 out_trans_cancel:
1464  if (ioend->io_append_trans) {
1465  current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1466  PF_FSTRANS);
1467  rwsem_acquire_read(
1468  &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1469  0, 1, _THIS_IP_);
1470  xfs_trans_cancel(ioend->io_append_trans, 0);
1471  }
1472 out_destroy_ioend:
1473  xfs_destroy_ioend(ioend);
1474  return ret;
1475 }
1476 
1477 /*
1478  * Punch out the delalloc blocks we have already allocated.
1479  *
1480  * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1481  * as the page is still locked at this point.
1482  */
1483 STATIC void
1484 xfs_vm_kill_delalloc_range(
1485  struct inode *inode,
1486  loff_t start,
1487  loff_t end)
1488 {
1489  struct xfs_inode *ip = XFS_I(inode);
1490  xfs_fileoff_t start_fsb;
1491  xfs_fileoff_t end_fsb;
1492  int error;
1493 
1494  start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1495  end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1496  if (end_fsb <= start_fsb)
1497  return;
1498 
1499  xfs_ilock(ip, XFS_ILOCK_EXCL);
1500  error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1501  end_fsb - start_fsb);
1502  if (error) {
1503  /* something screwed, just bail */
1504  if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1505  xfs_alert(ip->i_mount,
1506  "xfs_vm_write_failed: unable to clean up ino %lld",
1507  ip->i_ino);
1508  }
1509  }
1510  xfs_iunlock(ip, XFS_ILOCK_EXCL);
1511 }
1512 
1513 STATIC void
1514 xfs_vm_write_failed(
1515  struct inode *inode,
1516  struct page *page,
1517  loff_t pos,
1518  unsigned len)
1519 {
1520  loff_t block_offset = pos & PAGE_MASK;
1521  loff_t block_start;
1522  loff_t block_end;
1523  loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1524  loff_t to = from + len;
1525  struct buffer_head *bh, *head;
1526 
1527  ASSERT(block_offset + from == pos);
1528 
1529  head = page_buffers(page);
1530  block_start = 0;
1531  for (bh = head; bh != head || !block_start;
1532  bh = bh->b_this_page, block_start = block_end,
1533  block_offset += bh->b_size) {
1534  block_end = block_start + bh->b_size;
1535 
1536  /* skip buffers before the write */
1537  if (block_end <= from)
1538  continue;
1539 
1540  /* if the buffer is after the write, we're done */
1541  if (block_start >= to)
1542  break;
1543 
1544  if (!buffer_delay(bh))
1545  continue;
1546 
1547  if (!buffer_new(bh) && block_offset < i_size_read(inode))
1548  continue;
1549 
1550  xfs_vm_kill_delalloc_range(inode, block_offset,
1551  block_offset + bh->b_size);
1552  }
1553 
1554 }
1555 
1556 /*
1557  * This used to call block_write_begin(), but it unlocks and releases the page
1558  * on error, and we need that page to be able to punch stale delalloc blocks out
1559  * on failure. Hence we copy-n-waste it here and call xfs_vm_write_failed() at
1560  * the appropriate point.
1561  */
1562 STATIC int
1563 xfs_vm_write_begin(
1564  struct file *file,
1565  struct address_space *mapping,
1566  loff_t pos,
1567  unsigned len,
1568  unsigned flags,
1569  struct page **pagep,
1570  void **fsdata)
1571 {
1572  pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1573  struct page *page;
1574  int status;
1575 
1576  ASSERT(len <= PAGE_CACHE_SIZE);
1577 
1578  page = grab_cache_page_write_begin(mapping, index,
1579  flags | AOP_FLAG_NOFS);
1580  if (!page)
1581  return -ENOMEM;
1582 
1583  status = __block_write_begin(page, pos, len, xfs_get_blocks);
1584  if (unlikely(status)) {
1585  struct inode *inode = mapping->host;
1586 
1587  xfs_vm_write_failed(inode, page, pos, len);
1588  unlock_page(page);
1589 
1590  if (pos + len > i_size_read(inode))
1591  truncate_pagecache(inode, pos + len, i_size_read(inode));
1592 
1593  page_cache_release(page);
1594  page = NULL;
1595  }
1596 
1597  *pagep = page;
1598  return status;
1599 }
1600 
1601 /*
1602  * On failure, we only need to kill delalloc blocks beyond EOF because they
1603  * will never be written. For blocks within EOF, generic_write_end() zeros them
1604  * so they are safe to leave alone and be written with all the other valid data.
1605  */
1606 STATIC int
1607 xfs_vm_write_end(
1608  struct file *file,
1609  struct address_space *mapping,
1610  loff_t pos,
1611  unsigned len,
1612  unsigned copied,
1613  struct page *page,
1614  void *fsdata)
1615 {
1616  int ret;
1617 
1618  ASSERT(len <= PAGE_CACHE_SIZE);
1619 
1620  ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1621  if (unlikely(ret < len)) {
1622  struct inode *inode = mapping->host;
1623  size_t isize = i_size_read(inode);
1624  loff_t to = pos + len;
1625 
1626  if (to > isize) {
1627  truncate_pagecache(inode, to, isize);
1628  xfs_vm_kill_delalloc_range(inode, isize, to);
1629  }
1630  }
1631  return ret;
1632 }
1633 
1634 STATIC sector_t
1635 xfs_vm_bmap(
1636  struct address_space *mapping,
1637  sector_t block)
1638 {
1639  struct inode *inode = (struct inode *)mapping->host;
1640  struct xfs_inode *ip = XFS_I(inode);
1641 
1642  trace_xfs_vm_bmap(XFS_I(inode));
1643  xfs_ilock(ip, XFS_IOLOCK_SHARED);
1644  xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1645  xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1646  return generic_block_bmap(mapping, block, xfs_get_blocks);
1647 }
1648 
1649 STATIC int
1650 xfs_vm_readpage(
1651  struct file *unused,
1652  struct page *page)
1653 {
1654  return mpage_readpage(page, xfs_get_blocks);
1655 }
1656 
1657 STATIC int
1658 xfs_vm_readpages(
1659  struct file *unused,
1660  struct address_space *mapping,
1661  struct list_head *pages,
1662  unsigned nr_pages)
1663 {
1664  return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1665 }
1666 
1667 const struct address_space_operations xfs_address_space_operations = {
1668  .readpage = xfs_vm_readpage,
1669  .readpages = xfs_vm_readpages,
1670  .writepage = xfs_vm_writepage,
1671  .writepages = xfs_vm_writepages,
1672  .releasepage = xfs_vm_releasepage,
1673  .invalidatepage = xfs_vm_invalidatepage,
1674  .write_begin = xfs_vm_write_begin,
1675  .write_end = xfs_vm_write_end,
1676  .bmap = xfs_vm_bmap,
1677  .direct_IO = xfs_vm_direct_IO,
1678  .migratepage = buffer_migrate_page,
1679  .is_partially_uptodate = block_is_partially_uptodate,
1680  .error_remove_page = generic_error_remove_page,
1681 };