Linux Kernel 3.7.1
filemap.c
1 /*
2  * linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-1999 Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/export.h>
13 #include <linux/compiler.h>
14 #include <linux/fs.h>
15 #include <linux/uaccess.h>
16 #include <linux/aio.h>
17 #include <linux/capability.h>
18 #include <linux/kernel_stat.h>
19 #include <linux/gfp.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/mman.h>
23 #include <linux/pagemap.h>
24 #include <linux/file.h>
25 #include <linux/uio.h>
26 #include <linux/hash.h>
27 #include <linux/writeback.h>
28 #include <linux/backing-dev.h>
29 #include <linux/pagevec.h>
30 #include <linux/blkdev.h>
31 #include <linux/security.h>
32 #include <linux/cpuset.h>
33 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34 #include <linux/memcontrol.h>
35 #include <linux/cleancache.h>
36 #include "internal.h"
37 
38 /*
39  * FIXME: remove all knowledge of the buffer layer from the core VM
40  */
41 #include <linux/buffer_head.h> /* for try_to_free_buffers */
42 
43 #include <asm/mman.h>
44 
45 /*
46  * Shared mappings implemented 30.11.1994. It's not fully working yet,
47  * though.
48  *
49  * Shared mappings now work. 15.8.1995 Bruno.
50  *
51  * finished 'unifying' the page and buffer cache and SMP-threaded the
52  * page-cache, 21.05.1999, Ingo Molnar <[email protected]>
53  *
54  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <[email protected]>
55  */
56 
57 /*
58  * Lock ordering:
59  *
60  * ->i_mmap_mutex (truncate_pagecache)
61  * ->private_lock (__free_pte->__set_page_dirty_buffers)
62  * ->swap_lock (exclusive_swap_page, others)
63  * ->mapping->tree_lock
64  *
65  * ->i_mutex
66  * ->i_mmap_mutex (truncate->unmap_mapping_range)
67  *
68  * ->mmap_sem
69  * ->i_mmap_mutex
70  * ->page_table_lock or pte_lock (various, mainly in memory.c)
71  * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
72  *
73  * ->mmap_sem
74  * ->lock_page (access_process_vm)
75  *
76  * ->i_mutex (generic_file_buffered_write)
77  * ->mmap_sem (fault_in_pages_readable->do_page_fault)
78  *
79  * bdi->wb.list_lock
80  * sb_lock (fs/fs-writeback.c)
81  * ->mapping->tree_lock (__sync_single_inode)
82  *
83  * ->i_mmap_mutex
84  * ->anon_vma.lock (vma_adjust)
85  *
86  * ->anon_vma.lock
87  * ->page_table_lock or pte_lock (anon_vma_prepare and various)
88  *
89  * ->page_table_lock or pte_lock
90  * ->swap_lock (try_to_unmap_one)
91  * ->private_lock (try_to_unmap_one)
92  * ->tree_lock (try_to_unmap_one)
93  * ->zone.lru_lock (follow_page->mark_page_accessed)
94  * ->zone.lru_lock (check_pte_range->isolate_lru_page)
95  * ->private_lock (page_remove_rmap->set_page_dirty)
96  * ->tree_lock (page_remove_rmap->set_page_dirty)
97  * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
98  * ->inode->i_lock (page_remove_rmap->set_page_dirty)
99  * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
100  * ->inode->i_lock (zap_pte_range->set_page_dirty)
101  * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
102  *
103  * ->i_mmap_mutex
104  * ->tasklist_lock (memory_failure, collect_procs_ao)
105  */
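/*
 * How to read the ordering table above: within each block, an indented
 * lock is taken while the lock(s) listed above it in that block are
 * already held.  For example, the truncate path ends up holding
 * ->i_mutex, then ->i_mmap_mutex, and only then ->mapping->tree_lock.
 */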
106 
107 /*
108  * Delete a page from the page cache and free it. Caller has to make
109  * sure the page is locked and that nobody else uses it - or that usage
110  * is safe. The caller must hold the mapping's tree_lock.
111  */
112 void __delete_from_page_cache(struct page *page)
113 {
114  struct address_space *mapping = page->mapping;
115 
116  /*
117  * if we're uptodate, flush out into the cleancache, otherwise
118  * invalidate any existing cleancache entries. We can't leave
119  * stale data around in the cleancache once our page is gone
120  */
121  if (PageUptodate(page) && PageMappedToDisk(page))
122  cleancache_put_page(page);
123  else
124  cleancache_invalidate_page(mapping, page);
125 
126  radix_tree_delete(&mapping->page_tree, page->index);
127  page->mapping = NULL;
128  /* Leave page->index set: truncation lookup relies upon it */
129  mapping->nrpages--;
130  __dec_zone_page_state(page, NR_FILE_PAGES);
131  if (PageSwapBacked(page))
132  __dec_zone_page_state(page, NR_SHMEM);
133  BUG_ON(page_mapped(page));
134 
135  /*
136  * Some filesystems seem to re-dirty the page even after
137  * the VM has canceled the dirty bit (eg ext3 journaling).
138  *
139  * Fix it up by doing a final dirty accounting check after
140  * having removed the page entirely.
141  */
142  if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
143  dec_zone_page_state(page, NR_FILE_DIRTY);
144  dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
145  }
146 }
147 
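/*
 * delete_from_page_cache() below is the locked-mapping counterpart of
 * __delete_from_page_cache() above: it takes ->tree_lock itself, does
 * the memcg uncharge outside that lock, and drops the pagecache
 * reference with page_cache_release().  The caller must hold the page
 * lock (see the BUG_ON below).
 */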
156 void delete_from_page_cache(struct page *page)
157 {
158  struct address_space *mapping = page->mapping;
159  void (*freepage)(struct page *);
160 
161  BUG_ON(!PageLocked(page));
162 
163  freepage = mapping->a_ops->freepage;
164  spin_lock_irq(&mapping->tree_lock);
165  __delete_from_page_cache(page);
166  spin_unlock_irq(&mapping->tree_lock);
167  mem_cgroup_uncharge_cache_page(page);
168 
169  if (freepage)
170  freepage(page);
171  page_cache_release(page);
172 }
174 
175 static int sleep_on_page(void *word)
176 {
177  io_schedule();
178  return 0;
179 }
180 
181 static int sleep_on_page_killable(void *word)
182 {
183  sleep_on_page(word);
184  return fatal_signal_pending(current) ? -EINTR : 0;
185 }
186 
202 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
203  loff_t end, int sync_mode)
204 {
205  int ret;
206  struct writeback_control wbc = {
207  .sync_mode = sync_mode,
208  .nr_to_write = LONG_MAX,
209  .range_start = start,
210  .range_end = end,
211  };
212 
213  if (!mapping_cap_writeback_dirty(mapping))
214  return 0;
215 
216  ret = do_writepages(mapping, &wbc);
217  return ret;
218 }
219 
220 static inline int __filemap_fdatawrite(struct address_space *mapping,
221  int sync_mode)
222 {
223  return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
224 }
225 
226 int filemap_fdatawrite(struct address_space *mapping)
227 {
228  return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
229 }
231 
232 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
233  loff_t end)
234 {
235  return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
236 }
238 
246 int filemap_flush(struct address_space *mapping)
247 {
248  return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
249 }
251 
261 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
262  loff_t end_byte)
263 {
264  pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
265  pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
266  struct pagevec pvec;
267  int nr_pages;
268  int ret = 0;
269 
270  if (end_byte < start_byte)
271  return 0;
272 
273  pagevec_init(&pvec, 0);
274  while ((index <= end) &&
275  (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
276  PAGECACHE_TAG_WRITEBACK,
277  min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
278  unsigned i;
279 
280  for (i = 0; i < nr_pages; i++) {
281  struct page *page = pvec.pages[i];
282 
283  /* until radix tree lookup accepts end_index */
284  if (page->index > end)
285  continue;
286 
287  wait_on_page_writeback(page);
288  if (TestClearPageError(page))
289  ret = -EIO;
290  }
291  pagevec_release(&pvec);
292  cond_resched();
293  }
294 
295  /* Check for outstanding write errors */
296  if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
297  ret = -ENOSPC;
298  if (test_and_clear_bit(AS_EIO, &mapping->flags))
299  ret = -EIO;
300 
301  return ret;
302 }
304 
312 int filemap_fdatawait(struct address_space *mapping)
313 {
314  loff_t i_size = i_size_read(mapping->host);
315 
316  if (i_size == 0)
317  return 0;
318 
319  return filemap_fdatawait_range(mapping, 0, i_size - 1);
320 }
322 
323 int filemap_write_and_wait(struct address_space *mapping)
324 {
325  int err = 0;
326 
327  if (mapping->nrpages) {
328  err = filemap_fdatawrite(mapping);
329  /*
330  * Even if the above returned error, the pages may be
331  * written partially (e.g. -ENOSPC), so we wait for it.
332  * But the -EIO is special case, it may indicate the worst
333  * thing (e.g. bug) happened, so we avoid waiting for it.
334  */
335  if (err != -EIO) {
336  int err2 = filemap_fdatawait(mapping);
337  if (!err)
338  err = err2;
339  }
340  }
341  return err;
342 }
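/*
 * A minimal usage sketch (hypothetical helper, not part of this file):
 * an ->fsync()-style path typically pushes a mapping's dirty pages to
 * disk with the helpers above and reports any I/O error recorded on
 * the mapping (AS_EIO / AS_ENOSPC).
 */
static int example_sync_inode_pages(struct inode *inode)
{
	/* write back all dirty pagecache, wait, return 0 or -EIO/-ENOSPC */
	return filemap_write_and_wait(inode->i_mapping);
}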
344 
356 int filemap_write_and_wait_range(struct address_space *mapping,
357  loff_t lstart, loff_t lend)
358 {
359  int err = 0;
360 
361  if (mapping->nrpages) {
362  err = __filemap_fdatawrite_range(mapping, lstart, lend,
363  WB_SYNC_ALL);
364  /* See comment of filemap_write_and_wait() */
365  if (err != -EIO) {
366  int err2 = filemap_fdatawait_range(mapping,
367  lstart, lend);
368  if (!err)
369  err = err2;
370  }
371  }
372  return err;
373 }
375 
391 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
392 {
393  int error;
394 
395  VM_BUG_ON(!PageLocked(old));
396  VM_BUG_ON(!PageLocked(new));
397  VM_BUG_ON(new->mapping);
398 
399  error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
400  if (!error) {
401  struct address_space *mapping = old->mapping;
402  void (*freepage)(struct page *);
403 
404  pgoff_t offset = old->index;
405  freepage = mapping->a_ops->freepage;
406 
407  page_cache_get(new);
408  new->mapping = mapping;
409  new->index = offset;
410 
411  spin_lock_irq(&mapping->tree_lock);
412  __delete_from_page_cache(old);
413  error = radix_tree_insert(&mapping->page_tree, offset, new);
414  BUG_ON(error);
415  mapping->nrpages++;
416  __inc_zone_page_state(new, NR_FILE_PAGES);
417  if (PageSwapBacked(new))
418  __inc_zone_page_state(new, NR_SHMEM);
419  spin_unlock_irq(&mapping->tree_lock);
420  /* mem_cgroup codes must not be called under tree_lock */
421  mem_cgroup_replace_page_cache(old, new);
422  radix_tree_preload_end();
423  if (freepage)
424  freepage(old);
425  page_cache_release(old);
426  }
427 
428  return error;
429 }
431 
442 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
443  pgoff_t offset, gfp_t gfp_mask)
444 {
445  int error;
446 
447  VM_BUG_ON(!PageLocked(page));
448  VM_BUG_ON(PageSwapBacked(page));
449 
450  error = mem_cgroup_cache_charge(page, current->mm,
451  gfp_mask & GFP_RECLAIM_MASK);
452  if (error)
453  goto out;
454 
455  error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
456  if (error == 0) {
457  page_cache_get(page);
458  page->mapping = mapping;
459  page->index = offset;
460 
461  spin_lock_irq(&mapping->tree_lock);
462  error = radix_tree_insert(&mapping->page_tree, offset, page);
463  if (likely(!error)) {
464  mapping->nrpages++;
465  __inc_zone_page_state(page, NR_FILE_PAGES);
466  spin_unlock_irq(&mapping->tree_lock);
467  } else {
468  page->mapping = NULL;
469  /* Leave page->index set: truncation relies upon it */
470  spin_unlock_irq(&mapping->tree_lock);
472  page_cache_release(page);
473  }
474  radix_tree_preload_end();
475  } else
476  mem_cgroup_uncharge_cache_page(page);
477 out:
478  return error;
479 }
481 
482 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
483  pgoff_t offset, gfp_t gfp_mask)
484 {
485  int ret;
486 
487  ret = add_to_page_cache(page, mapping, offset, gfp_mask);
488  if (ret == 0)
489  lru_cache_add_file(page);
490  return ret;
491 }
493 
494 #ifdef CONFIG_NUMA
495 struct page *__page_cache_alloc(gfp_t gfp)
496 {
497  int n;
498  struct page *page;
499 
500  if (cpuset_do_page_mem_spread()) {
501  unsigned int cpuset_mems_cookie;
502  do {
503  cpuset_mems_cookie = get_mems_allowed();
504  n = cpuset_mem_spread_node();
505  page = alloc_pages_exact_node(n, gfp, 0);
506  } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
507 
508  return page;
509  }
510  return alloc_pages(gfp, 0);
511 }
512 EXPORT_SYMBOL(__page_cache_alloc);
513 #endif
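/*
 * Note on the loop above: get_mems_allowed()/put_mems_allowed() form a
 * sequence-count style retry.  If the cpuset's allowed memory nodes
 * changed while we were allocating and no page was obtained, the
 * allocation is retried against the updated node mask instead of
 * failing outright.
 */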
514 
515 /*
516  * In order to wait for pages to become available there must be
517  * waitqueues associated with pages. By using a hash table of
518  * waitqueues where the bucket discipline is to maintain all
519  * waiters on the same queue and wake all when any of the pages
520  * become available, and for the woken contexts to check to be
521  * sure the appropriate page became available, this saves space
522  * at a cost of "thundering herd" phenomena during rare hash
523  * collisions.
524  */
525 static wait_queue_head_t *page_waitqueue(struct page *page)
526 {
527  const struct zone *zone = page_zone(page);
528 
529  return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
530 }
531 
532 static inline void wake_up_page(struct page *page, int bit)
533 {
534  __wake_up_bit(page_waitqueue(page), &page->flags, bit);
535 }
536 
537 void wait_on_page_bit(struct page *page, int bit_nr)
538 {
539  DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
540 
541  if (test_bit(bit_nr, &page->flags))
542  __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
543  TASK_UNINTERRUPTIBLE);
544 }
545 EXPORT_SYMBOL(wait_on_page_bit);
546 
547 int wait_on_page_bit_killable(struct page *page, int bit_nr)
548 {
549  DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
550 
551  if (!test_bit(bit_nr, &page->flags))
552  return 0;
553 
554  return __wait_on_bit(page_waitqueue(page), &wait,
555  sleep_on_page_killable, TASK_KILLABLE);
556 }
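/*
 * A sketch of how higher-level waiters are built on wait_on_page_bit()
 * (the real wait_on_page_locked()/wait_on_page_writeback() helpers in
 * include/linux/pagemap.h are essentially this):
 */
static inline void example_wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}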
557 
565 void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
566 {
567  wait_queue_head_t *q = page_waitqueue(page);
568  unsigned long flags;
569 
570  spin_lock_irqsave(&q->lock, flags);
571  __add_wait_queue(q, waiter);
572  spin_unlock_irqrestore(&q->lock, flags);
573 }
575 
588 void unlock_page(struct page *page)
589 {
590  VM_BUG_ON(!PageLocked(page));
591  clear_bit_unlock(PG_locked, &page->flags);
592  smp_mb__after_clear_bit();
593  wake_up_page(page, PG_locked);
594 }
595 EXPORT_SYMBOL(unlock_page);
596 
601 void end_page_writeback(struct page *page)
602 {
603  if (TestClearPageReclaim(page))
604  rotate_reclaimable_page(page);
605 
606  if (!test_clear_page_writeback(page))
607  BUG();
608 
609  smp_mb__after_clear_bit();
610  wake_up_page(page, PG_writeback);
611 }
612 EXPORT_SYMBOL(end_page_writeback);
613 
618 void __lock_page(struct page *page)
619 {
620  DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
621 
622  __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
623  TASK_UNINTERRUPTIBLE);
624 }
625 EXPORT_SYMBOL(__lock_page);
626 
627 int __lock_page_killable(struct page *page)
628 {
629  DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
630 
631  return __wait_on_bit_lock(page_waitqueue(page), &wait,
632  sleep_on_page_killable, TASK_KILLABLE);
633 }
635 
636 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
637  unsigned int flags)
638 {
639  if (flags & FAULT_FLAG_ALLOW_RETRY) {
640  /*
641  * CAUTION! In this case, mmap_sem is not released
642  * even though return 0.
643  */
644  if (flags & FAULT_FLAG_RETRY_NOWAIT)
645  return 0;
646 
647  up_read(&mm->mmap_sem);
648  if (flags & FAULT_FLAG_KILLABLE)
649  wait_on_page_locked_killable(page);
650  else
651  wait_on_page_locked(page);
652  return 0;
653  } else {
654  if (flags & FAULT_FLAG_KILLABLE) {
655  int ret;
656 
657  ret = __lock_page_killable(page);
658  if (ret) {
659  up_read(&mm->mmap_sem);
660  return 0;
661  }
662  } else
663  __lock_page(page);
664  return 1;
665  }
666 }
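/*
 * Return-value contract of __lock_page_or_retry(), as implemented
 * above: it returns 1 with the page locked and mmap_sem still held,
 * or 0 when the caller should retry the fault - in which case
 * mmap_sem has been dropped, except in the FAULT_FLAG_RETRY_NOWAIT
 * case flagged by the CAUTION comment.
 */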
667 
676 struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
677 {
678  void **pagep;
679  struct page *page;
680 
681  rcu_read_lock();
682 repeat:
683  page = NULL;
684  pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
685  if (pagep) {
686  page = radix_tree_deref_slot(pagep);
687  if (unlikely(!page))
688  goto out;
689  if (radix_tree_exception(page)) {
690  if (radix_tree_deref_retry(page))
691  goto repeat;
692  /*
693  * Otherwise, shmem/tmpfs must be storing a swap entry
694  * here as an exceptional entry: so return it without
695  * attempting to raise page count.
696  */
697  goto out;
698  }
699  if (!page_cache_get_speculative(page))
700  goto repeat;
701 
702  /*
703  * Has the page moved?
704  * This is part of the lockless pagecache protocol. See
705  * include/linux/pagemap.h for details.
706  */
707  if (unlikely(page != *pagep)) {
708  page_cache_release(page);
709  goto repeat;
710  }
711  }
712 out:
713  rcu_read_unlock();
714 
715  return page;
716 }
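/*
 * Hypothetical usage sketch (not part of this file): a find_get_page()
 * caller owns the reference it gets back and must drop it.
 */
static void example_peek_at_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		/* page is pinned but NOT locked here; inspect it, then release */
		page_cache_release(page);
	}
}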
718 
729 struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
730 {
731  struct page *page;
732 
733 repeat:
734  page = find_get_page(mapping, offset);
735  if (page && !radix_tree_exception(page)) {
736  lock_page(page);
737  /* Has the page been truncated? */
738  if (unlikely(page->mapping != mapping)) {
739  unlock_page(page);
740  page_cache_release(page);
741  goto repeat;
742  }
743  VM_BUG_ON(page->index != offset);
744  }
745  return page;
746 }
748 
766 struct page *find_or_create_page(struct address_space *mapping,
767  pgoff_t index, gfp_t gfp_mask)
768 {
769  struct page *page;
770  int err;
771 repeat:
772  page = find_lock_page(mapping, index);
773  if (!page) {
774  page = __page_cache_alloc(gfp_mask);
775  if (!page)
776  return NULL;
777  /*
778  * We want a regular kernel memory (not highmem or DMA etc)
779  * allocation for the radix tree nodes, but we need to honour
780  * the context-specific requirements the caller has asked for.
781  * GFP_RECLAIM_MASK collects those requirements.
782  */
783  err = add_to_page_cache_lru(page, mapping, index,
784  (gfp_mask & GFP_RECLAIM_MASK));
785  if (unlikely(err)) {
786  page_cache_release(page);
787  page = NULL;
788  if (err == -EEXIST)
789  goto repeat;
790  }
791  }
792  return page;
793 }
795 
812 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
813  unsigned int nr_pages, struct page **pages)
814 {
815  struct radix_tree_iter iter;
816  void **slot;
817  unsigned ret = 0;
818 
819  if (unlikely(!nr_pages))
820  return 0;
821 
822  rcu_read_lock();
823 restart:
824  radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
825  struct page *page;
826 repeat:
827  page = radix_tree_deref_slot(slot);
828  if (unlikely(!page))
829  continue;
830 
831  if (radix_tree_exception(page)) {
832  if (radix_tree_deref_retry(page)) {
833  /*
834  * Transient condition which can only trigger
835  * when entry at index 0 moves out of or back
836  * to root: none yet gotten, safe to restart.
837  */
838  WARN_ON(iter.index);
839  goto restart;
840  }
841  /*
842  * Otherwise, shmem/tmpfs must be storing a swap entry
843  * here as an exceptional entry: so skip over it -
844  * we only reach this from invalidate_mapping_pages().
845  */
846  continue;
847  }
848 
849  if (!page_cache_get_speculative(page))
850  goto repeat;
851 
852  /* Has the page moved? */
853  if (unlikely(page != *slot)) {
854  page_cache_release(page);
855  goto repeat;
856  }
857 
858  pages[ret] = page;
859  if (++ret == nr_pages)
860  break;
861  }
862 
863  rcu_read_unlock();
864  return ret;
865 }
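/*
 * Note: most callers do not use find_get_pages() directly; they go
 * through pagevec_lookup() (mm/swap.c), which wraps this function and
 * batches up to PAGEVEC_SIZE page references in a struct pagevec.
 */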
866 
879 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
880  unsigned int nr_pages, struct page **pages)
881 {
882  struct radix_tree_iter iter;
883  void **slot;
884  unsigned int ret = 0;
885 
886  if (unlikely(!nr_pages))
887  return 0;
888 
889  rcu_read_lock();
890 restart:
891  radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
892  struct page *page;
893 repeat:
894  page = radix_tree_deref_slot(slot);
895  /* The hole, there is no reason to continue */
896  if (unlikely(!page))
897  break;
898 
899  if (radix_tree_exception(page)) {
900  if (radix_tree_deref_retry(page)) {
901  /*
902  * Transient condition which can only trigger
903  * when entry at index 0 moves out of or back
904  * to root: none yet gotten, safe to restart.
905  */
906  goto restart;
907  }
908  /*
909  * Otherwise, shmem/tmpfs must be storing a swap entry
910  * here as an exceptional entry: so stop looking for
911  * contiguous pages.
912  */
913  break;
914  }
915 
916  if (!page_cache_get_speculative(page))
917  goto repeat;
918 
919  /* Has the page moved? */
920  if (unlikely(page != *slot)) {
921  page_cache_release(page);
922  goto repeat;
923  }
924 
925  /*
926  * must check mapping and index after taking the ref.
927  * otherwise we can get both false positives and false
928  * negatives, which is just confusing to the caller.
929  */
930  if (page->mapping == NULL || page->index != iter.index) {
931  page_cache_release(page);
932  break;
933  }
934 
935  pages[ret] = page;
936  if (++ret == nr_pages)
937  break;
938  }
939  rcu_read_unlock();
940  return ret;
941 }
943 
955 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
956  int tag, unsigned int nr_pages, struct page **pages)
957 {
958  struct radix_tree_iter iter;
959  void **slot;
960  unsigned ret = 0;
961 
962  if (unlikely(!nr_pages))
963  return 0;
964 
965  rcu_read_lock();
966 restart:
967  radix_tree_for_each_tagged(slot, &mapping->page_tree,
968  &iter, *index, tag) {
969  struct page *page;
970 repeat:
971  page = radix_tree_deref_slot(slot);
972  if (unlikely(!page))
973  continue;
974 
975  if (radix_tree_exception(page)) {
976  if (radix_tree_deref_retry(page)) {
977  /*
978  * Transient condition which can only trigger
979  * when entry at index 0 moves out of or back
980  * to root: none yet gotten, safe to restart.
981  */
982  goto restart;
983  }
984  /*
985  * This function is never used on a shmem/tmpfs
986  * mapping, so a swap entry won't be found here.
987  */
988  BUG();
989  }
990 
991  if (!page_cache_get_speculative(page))
992  goto repeat;
993 
994  /* Has the page moved? */
995  if (unlikely(page != *slot)) {
996  page_cache_release(page);
997  goto repeat;
998  }
999 
1000  pages[ret] = page;
1001  if (++ret == nr_pages)
1002  break;
1003  }
1004 
1005  rcu_read_unlock();
1006 
1007  if (ret)
1008  *index = pages[ret - 1]->index + 1;
1009 
1010  return ret;
1011 }
1013 
1027 struct page *
1028 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1029 {
1030  struct page *page = find_get_page(mapping, index);
1031 
1032  if (page) {
1033  if (trylock_page(page))
1034  return page;
1035  page_cache_release(page);
1036  return NULL;
1037  }
1038  page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1039  if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1040  page_cache_release(page);
1041  page = NULL;
1042  }
1043  return page;
1044 }
1046 
1047 /*
1048  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1049  * a _large_ part of the i/o request. Imagine the worst scenario:
1050  *
1051  * ---R__________________________________________B__________
1052  * ^ reading here ^ bad block(assume 4k)
1053  *
1054  * read(R) => miss => readahead(R...B) => media error => frustrating retries
1055  * => failing the whole request => read(R) => read(R+1) =>
1056  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
1057  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
1058  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
1059  *
1060  * It is going insane. Fix it by quickly scaling down the readahead size.
1061  */
1062 static void shrink_readahead_size_eio(struct file *filp,
1063  struct file_ra_state *ra)
1064 {
1065  ra->ra_pages /= 4;
1066 }
1067 
1081 static void do_generic_file_read(struct file *filp, loff_t *ppos,
1082  read_descriptor_t *desc, read_actor_t actor)
1083 {
1084  struct address_space *mapping = filp->f_mapping;
1085  struct inode *inode = mapping->host;
1086  struct file_ra_state *ra = &filp->f_ra;
1087  pgoff_t index;
1088  pgoff_t last_index;
1089  pgoff_t prev_index;
1090  unsigned long offset; /* offset into pagecache page */
1091  unsigned int prev_offset;
1092  int error;
1093 
1094  index = *ppos >> PAGE_CACHE_SHIFT;
1095  prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1096  prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
1097  last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1098  offset = *ppos & ~PAGE_CACHE_MASK;
1099 
1100  for (;;) {
1101  struct page *page;
1102  pgoff_t end_index;
1103  loff_t isize;
1104  unsigned long nr, ret;
1105 
1106  cond_resched();
1107 find_page:
1108  page = find_get_page(mapping, index);
1109  if (!page) {
1110  page_cache_sync_readahead(mapping,
1111  ra, filp,
1112  index, last_index - index);
1113  page = find_get_page(mapping, index);
1114  if (unlikely(page == NULL))
1115  goto no_cached_page;
1116  }
1117  if (PageReadahead(page)) {
1118  page_cache_async_readahead(mapping,
1119  ra, filp, page,
1120  index, last_index - index);
1121  }
1122  if (!PageUptodate(page)) {
1123  if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1124  !mapping->a_ops->is_partially_uptodate)
1125  goto page_not_up_to_date;
1126  if (!trylock_page(page))
1127  goto page_not_up_to_date;
1128  /* Did it get truncated before we got the lock? */
1129  if (!page->mapping)
1130  goto page_not_up_to_date_locked;
1131  if (!mapping->a_ops->is_partially_uptodate(page,
1132  desc, offset))
1133  goto page_not_up_to_date_locked;
1134  unlock_page(page);
1135  }
1136 page_ok:
1137  /*
1138  * i_size must be checked after we know the page is Uptodate.
1139  *
1140  * Checking i_size after the check allows us to calculate
1141  * the correct value for "nr", which means the zero-filled
1142  * part of the page is not copied back to userspace (unless
1143  * another truncate extends the file - this is desired though).
1144  */
1145 
1146  isize = i_size_read(inode);
1147  end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1148  if (unlikely(!isize || index > end_index)) {
1149  page_cache_release(page);
1150  goto out;
1151  }
1152 
1153  /* nr is the maximum number of bytes to copy from this page */
1154  nr = PAGE_CACHE_SIZE;
1155  if (index == end_index) {
1156  nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1157  if (nr <= offset) {
1158  page_cache_release(page);
1159  goto out;
1160  }
1161  }
1162  nr = nr - offset;
1163 
1164  /* If users can be writing to this page using arbitrary
1165  * virtual addresses, take care about potential aliasing
1166  * before reading the page on the kernel side.
1167  */
1168  if (mapping_writably_mapped(mapping))
1169  flush_dcache_page(page);
1170 
1171  /*
1172  * When a sequential read accesses a page several times,
1173  * only mark it as accessed the first time.
1174  */
1175  if (prev_index != index || offset != prev_offset)
1176  mark_page_accessed(page);
1177  prev_index = index;
1178 
1179  /*
1180  * Ok, we have the page, and it's up-to-date, so
1181  * now we can copy it to user space...
1182  *
1183  * The actor routine returns how many bytes were actually used..
1184  * NOTE! This may not be the same as how much of a user buffer
1185  * we filled up (we may be padding etc), so we can only update
1186  * "pos" here (the actor routine has to update the user buffer
1187  * pointers and the remaining count).
1188  */
1189  ret = actor(desc, page, offset, nr);
1190  offset += ret;
1191  index += offset >> PAGE_CACHE_SHIFT;
1192  offset &= ~PAGE_CACHE_MASK;
1193  prev_offset = offset;
1194 
1195  page_cache_release(page);
1196  if (ret == nr && desc->count)
1197  continue;
1198  goto out;
1199 
1200 page_not_up_to_date:
1201  /* Get exclusive access to the page ... */
1202  error = lock_page_killable(page);
1203  if (unlikely(error))
1204  goto readpage_error;
1205 
1206 page_not_up_to_date_locked:
1207  /* Did it get truncated before we got the lock? */
1208  if (!page->mapping) {
1209  unlock_page(page);
1210  page_cache_release(page);
1211  continue;
1212  }
1213 
1214  /* Did somebody else fill it already? */
1215  if (PageUptodate(page)) {
1216  unlock_page(page);
1217  goto page_ok;
1218  }
1219 
1220 readpage:
1221  /*
1222  * A previous I/O error may have been due to temporary
1223  * failures, eg. multipath errors.
1224  * PG_error will be set again if readpage fails.
1225  */
1226  ClearPageError(page);
1227  /* Start the actual read. The read will unlock the page. */
1228  error = mapping->a_ops->readpage(filp, page);
1229 
1230  if (unlikely(error)) {
1231  if (error == AOP_TRUNCATED_PAGE) {
1232  page_cache_release(page);
1233  goto find_page;
1234  }
1235  goto readpage_error;
1236  }
1237 
1238  if (!PageUptodate(page)) {
1239  error = lock_page_killable(page);
1240  if (unlikely(error))
1241  goto readpage_error;
1242  if (!PageUptodate(page)) {
1243  if (page->mapping == NULL) {
1244  /*
1245  * invalidate_mapping_pages got it
1246  */
1247  unlock_page(page);
1248  page_cache_release(page);
1249  goto find_page;
1250  }
1251  unlock_page(page);
1252  shrink_readahead_size_eio(filp, ra);
1253  error = -EIO;
1254  goto readpage_error;
1255  }
1256  unlock_page(page);
1257  }
1258 
1259  goto page_ok;
1260 
1261 readpage_error:
1262  /* UHHUH! A synchronous read error occurred. Report it */
1263  desc->error = error;
1264  page_cache_release(page);
1265  goto out;
1266 
1267 no_cached_page:
1268  /*
1269  * Ok, it wasn't cached, so we need to create a new
1270  * page..
1271  */
1272  page = page_cache_alloc_cold(mapping);
1273  if (!page) {
1274  desc->error = -ENOMEM;
1275  goto out;
1276  }
1277  error = add_to_page_cache_lru(page, mapping,
1278  index, GFP_KERNEL);
1279  if (error) {
1280  page_cache_release(page);
1281  if (error == -EEXIST)
1282  goto find_page;
1283  desc->error = error;
1284  goto out;
1285  }
1286  goto readpage;
1287  }
1288 
1289 out:
1290  ra->prev_pos = prev_index;
1291  ra->prev_pos <<= PAGE_CACHE_SHIFT;
1292  ra->prev_pos |= prev_offset;
1293 
1294  *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1295  file_accessed(filp);
1296 }
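/*
 * Shape of do_generic_file_read() above, label by label:
 *   find_page                    - look the page up, kicking off readahead
 *   page_ok                      - page is uptodate; copy it out via the actor
 *   page_not_up_to_date[_locked] - lock and re-check before reading
 *   readpage                     - ask the filesystem's ->readpage() to fill it
 *   readpage_error               - report a synchronous read error via desc->error
 *   no_cached_page               - allocate and insert a new pagecache page
 */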
1297 
1298 int file_read_actor(read_descriptor_t *desc, struct page *page,
1299  unsigned long offset, unsigned long size)
1300 {
1301  char *kaddr;
1302  unsigned long left, count = desc->count;
1303 
1304  if (size > count)
1305  size = count;
1306 
1307  /*
1308  * Faults on the destination of a read are common, so do it before
1309  * taking the kmap.
1310  */
1311  if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1312  kaddr = kmap_atomic(page);
1313  left = __copy_to_user_inatomic(desc->arg.buf,
1314  kaddr + offset, size);
1315  kunmap_atomic(kaddr);
1316  if (left == 0)
1317  goto success;
1318  }
1319 
1320  /* Do it the slow way */
1321  kaddr = kmap(page);
1322  left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1323  kunmap(page);
1324 
1325  if (left) {
1326  size -= left;
1327  desc->error = -EFAULT;
1328  }
1329 success:
1330  desc->count = count - size;
1331  desc->written += size;
1332  desc->arg.buf += size;
1333  return size;
1334 }
1335 
1336 /*
1337  * Performs necessary checks before doing a write
1338  * @iov: io vector request
1339  * @nr_segs: number of segments in the iovec
1340  * @count: number of bytes to write
1341  * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1342  *
1343  * Adjust number of segments and amount of bytes to write (nr_segs should be
1344  * properly initialized first). Returns appropriate error code that caller
1345  * should return or zero in case that write should be allowed.
1346  */
1347 int generic_segment_checks(const struct iovec *iov,
1348  unsigned long *nr_segs, size_t *count, int access_flags)
1349 {
1350  unsigned long seg;
1351  size_t cnt = 0;
1352  for (seg = 0; seg < *nr_segs; seg++) {
1353  const struct iovec *iv = &iov[seg];
1354 
1355  /*
1356  * If any segment has a negative length, or the cumulative
1357  * length ever wraps negative then return -EINVAL.
1358  */
1359  cnt += iv->iov_len;
1360  if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1361  return -EINVAL;
1362  if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1363  continue;
1364  if (seg == 0)
1365  return -EFAULT;
1366  *nr_segs = seg;
1367  cnt -= iv->iov_len; /* This segment is no good */
1368  break;
1369  }
1370  *count = cnt;
1371  return 0;
1372 }
1374 
1385 ssize_t
1386 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1387  unsigned long nr_segs, loff_t pos)
1388 {
1389  struct file *filp = iocb->ki_filp;
1390  ssize_t retval;
1391  unsigned long seg = 0;
1392  size_t count;
1393  loff_t *ppos = &iocb->ki_pos;
1394 
1395  count = 0;
1396  retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1397  if (retval)
1398  return retval;
1399 
1400  /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1401  if (filp->f_flags & O_DIRECT) {
1402  loff_t size;
1403  struct address_space *mapping;
1404  struct inode *inode;
1405 
1406  mapping = filp->f_mapping;
1407  inode = mapping->host;
1408  if (!count)
1409  goto out; /* skip atime */
1410  size = i_size_read(inode);
1411  if (pos < size) {
1412  retval = filemap_write_and_wait_range(mapping, pos,
1413  pos + iov_length(iov, nr_segs) - 1);
1414  if (!retval) {
1415  retval = mapping->a_ops->direct_IO(READ, iocb,
1416  iov, pos, nr_segs);
1417  }
1418  if (retval > 0) {
1419  *ppos = pos + retval;
1420  count -= retval;
1421  }
1422 
1423  /*
1424  * Btrfs can have a short DIO read if we encounter
1425  * compressed extents, so if there was an error, or if
1426  * we've already read everything we wanted to, or if
1427  * there was a short read because we hit EOF, go ahead
1428  * and return. Otherwise fallthrough to buffered io for
1429  * the rest of the read.
1430  */
1431  if (retval < 0 || !count || *ppos >= size) {
1432  file_accessed(filp);
1433  goto out;
1434  }
1435  }
1436  }
1437 
1438  count = retval;
1439  for (seg = 0; seg < nr_segs; seg++) {
1440  read_descriptor_t desc;
1441  loff_t offset = 0;
1442 
1443  /*
1444  * If we did a short DIO read we need to skip the section of the
1445  * iov that we've already read data into.
1446  */
1447  if (count) {
1448  if (count > iov[seg].iov_len) {
1449  count -= iov[seg].iov_len;
1450  continue;
1451  }
1452  offset = count;
1453  count = 0;
1454  }
1455 
1456  desc.written = 0;
1457  desc.arg.buf = iov[seg].iov_base + offset;
1458  desc.count = iov[seg].iov_len - offset;
1459  if (desc.count == 0)
1460  continue;
1461  desc.error = 0;
1462  do_generic_file_read(filp, ppos, &desc, file_read_actor);
1463  retval += desc.written;
1464  if (desc.error) {
1465  retval = retval ?: desc.error;
1466  break;
1467  }
1468  if (desc.count > 0)
1469  break;
1470  }
1471 out:
1472  return retval;
1473 }
1475 
1476 #ifdef CONFIG_MMU
1477 
1485 static int page_cache_read(struct file *file, pgoff_t offset)
1486 {
1487  struct address_space *mapping = file->f_mapping;
1488  struct page *page;
1489  int ret;
1490 
1491  do {
1492  page = page_cache_alloc_cold(mapping);
1493  if (!page)
1494  return -ENOMEM;
1495 
1496  ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1497  if (ret == 0)
1498  ret = mapping->a_ops->readpage(file, page);
1499  else if (ret == -EEXIST)
1500  ret = 0; /* losing race to add is OK */
1501 
1502  page_cache_release(page);
1503 
1504  } while (ret == AOP_TRUNCATED_PAGE);
1505 
1506  return ret;
1507 }
1508 
1509 #define MMAP_LOTSAMISS (100)
1510 
1511 /*
1512  * Synchronous readahead happens when we don't even find
1513  * a page in the page cache at all.
1514  */
1515 static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1516  struct file_ra_state *ra,
1517  struct file *file,
1518  pgoff_t offset)
1519 {
1520  unsigned long ra_pages;
1521  struct address_space *mapping = file->f_mapping;
1522 
1523  /* If we don't want any read-ahead, don't bother */
1524  if (VM_RandomReadHint(vma))
1525  return;
1526  if (!ra->ra_pages)
1527  return;
1528 
1529  if (VM_SequentialReadHint(vma)) {
1530  page_cache_sync_readahead(mapping, ra, file, offset,
1531  ra->ra_pages);
1532  return;
1533  }
1534 
1535  /* Avoid banging the cache line if not needed */
1536  if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1537  ra->mmap_miss++;
1538 
1539  /*
1540  * Do we miss much more than hit in this file? If so,
1541  * stop bothering with read-ahead. It will only hurt.
1542  */
1543  if (ra->mmap_miss > MMAP_LOTSAMISS)
1544  return;
1545 
1546  /*
1547  * mmap read-around
1548  */
1549  ra_pages = max_sane_readahead(ra->ra_pages);
1550  ra->start = max_t(long, 0, offset - ra_pages / 2);
1551  ra->size = ra_pages;
1552  ra->async_size = ra_pages / 4;
1553  ra_submit(ra, mapping, file);
1554 }
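/*
 * Worked example of the read-around window set up above: with
 * ra->ra_pages == 32 and a fault at offset 100, ra->start becomes 84,
 * ra->size 32 and ra->async_size 8, i.e. a window centred (roughly)
 * on the faulting page with its last quarter marked for asynchronous
 * readahead.
 */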
1555 
1556 /*
1557  * Asynchronous readahead happens when we find the page and PG_readahead,
1558  * so we want to possibly extend the readahead further..
1559  */
1560 static void do_async_mmap_readahead(struct vm_area_struct *vma,
1561  struct file_ra_state *ra,
1562  struct file *file,
1563  struct page *page,
1564  pgoff_t offset)
1565 {
1566  struct address_space *mapping = file->f_mapping;
1567 
1568  /* If we don't want any read-ahead, don't bother */
1569  if (VM_RandomReadHint(vma))
1570  return;
1571  if (ra->mmap_miss > 0)
1572  ra->mmap_miss--;
1573  if (PageReadahead(page))
1574  page_cache_async_readahead(mapping, ra, file,
1575  page, offset, ra->ra_pages);
1576 }
1577 
1590 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1591 {
1592  int error;
1593  struct file *file = vma->vm_file;
1594  struct address_space *mapping = file->f_mapping;
1595  struct file_ra_state *ra = &file->f_ra;
1596  struct inode *inode = mapping->host;
1597  pgoff_t offset = vmf->pgoff;
1598  struct page *page;
1599  pgoff_t size;
1600  int ret = 0;
1601 
1602  size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1603  if (offset >= size)
1604  return VM_FAULT_SIGBUS;
1605 
1606  /*
1607  * Do we have something in the page cache already?
1608  */
1609  page = find_get_page(mapping, offset);
1610  if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
1611  /*
1612  * We found the page, so try async readahead before
1613  * waiting for the lock.
1614  */
1615  do_async_mmap_readahead(vma, ra, file, page, offset);
1616  } else if (!page) {
1617  /* No page in the page cache at all */
1618  do_sync_mmap_readahead(vma, ra, file, offset);
1619  count_vm_event(PGMAJFAULT);
1620  mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1621  ret = VM_FAULT_MAJOR;
1622 retry_find:
1623  page = find_get_page(mapping, offset);
1624  if (!page)
1625  goto no_cached_page;
1626  }
1627 
1628  if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
1629  page_cache_release(page);
1630  return ret | VM_FAULT_RETRY;
1631  }
1632 
1633  /* Did it get truncated? */
1634  if (unlikely(page->mapping != mapping)) {
1635  unlock_page(page);
1636  put_page(page);
1637  goto retry_find;
1638  }
1639  VM_BUG_ON(page->index != offset);
1640 
1641  /*
1642  * We have a locked page in the page cache, now we need to check
1643  * that it's up-to-date. If not, it is going to be due to an error.
1644  */
1645  if (unlikely(!PageUptodate(page)))
1646  goto page_not_uptodate;
1647 
1648  /*
1649  * Found the page and have a reference on it.
1650  * We must recheck i_size under page lock.
1651  */
1652  size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1653  if (unlikely(offset >= size)) {
1654  unlock_page(page);
1655  page_cache_release(page);
1656  return VM_FAULT_SIGBUS;
1657  }
1658 
1659  vmf->page = page;
1660  return ret | VM_FAULT_LOCKED;
1661 
1662 no_cached_page:
1663  /*
1664  * We're only likely to ever get here if MADV_RANDOM is in
1665  * effect.
1666  */
1667  error = page_cache_read(file, offset);
1668 
1669  /*
1670  * The page we want has now been added to the page cache.
1671  * In the unlikely event that someone removed it in the
1672  * meantime, we'll just come back here and read it again.
1673  */
1674  if (error >= 0)
1675  goto retry_find;
1676 
1677  /*
1678  * An error return from page_cache_read can result if the
1679  * system is low on memory, or a problem occurs while trying
1680  * to schedule I/O.
1681  */
1682  if (error == -ENOMEM)
1683  return VM_FAULT_OOM;
1684  return VM_FAULT_SIGBUS;
1685 
1686 page_not_uptodate:
1687  /*
1688  * Umm, take care of errors if the page isn't up-to-date.
1689  * Try to re-read it _once_. We do this synchronously,
1690  * because there really aren't any performance issues here
1691  * and we need to check for errors.
1692  */
1693  ClearPageError(page);
1694  error = mapping->a_ops->readpage(file, page);
1695  if (!error) {
1696  wait_on_page_locked(page);
1697  if (!PageUptodate(page))
1698  error = -EIO;
1699  }
1700  page_cache_release(page);
1701 
1702  if (!error || error == AOP_TRUNCATED_PAGE)
1703  goto retry_find;
1704 
1705  /* Things didn't work out. Return zero to tell the mm layer so. */
1706  shrink_readahead_size_eio(file, ra);
1707  return VM_FAULT_SIGBUS;
1708 }
1710 
1711 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712 {
1713  struct page *page = vmf->page;
1714  struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1715  int ret = VM_FAULT_LOCKED;
1716 
1717  sb_start_pagefault(inode->i_sb);
1718  file_update_time(vma->vm_file);
1719  lock_page(page);
1720  if (page->mapping != inode->i_mapping) {
1721  unlock_page(page);
1722  ret = VM_FAULT_NOPAGE;
1723  goto out;
1724  }
1725  /*
1726  * We mark the page dirty already here so that when freeze is in
1727  * progress, we are guaranteed that writeback during freezing will
1728  * see the dirty page and writeprotect it again.
1729  */
1730  set_page_dirty(page);
1731 out:
1732  sb_end_pagefault(inode->i_sb);
1733  return ret;
1734 }
1735 EXPORT_SYMBOL(filemap_page_mkwrite);
1736 
1737 const struct vm_operations_struct generic_file_vm_ops = {
1738  .fault = filemap_fault,
1739  .page_mkwrite = filemap_page_mkwrite,
1740  .remap_pages = generic_file_remap_pages,
1741 };
1742 
1743 /* This is used for a general mmap of a disk file */
1744 
1745 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1746 {
1747  struct address_space *mapping = file->f_mapping;
1748 
1749  if (!mapping->a_ops->readpage)
1750  return -ENOEXEC;
1751  file_accessed(file);
1752  vma->vm_ops = &generic_file_vm_ops;
1753  return 0;
1754 }
1755 
1756 /*
1757  * This is for filesystems which do not implement ->writepage.
1758  */
1759 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1760 {
1761  if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1762  return -EINVAL;
1763  return generic_file_mmap(file, vma);
1764 }
1765 #else
1766 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1767 {
1768  return -ENOSYS;
1769 }
1770 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1771 {
1772  return -ENOSYS;
1773 }
1774 #endif /* CONFIG_MMU */
1775 
1776 EXPORT_SYMBOL(generic_file_mmap);
1777 EXPORT_SYMBOL(generic_file_readonly_mmap);
1778 
1779 static struct page *__read_cache_page(struct address_space *mapping,
1780  pgoff_t index,
1781  int (*filler)(void *, struct page *),
1782  void *data,
1783  gfp_t gfp)
1784 {
1785  struct page *page;
1786  int err;
1787 repeat:
1788  page = find_get_page(mapping, index);
1789  if (!page) {
1790  page = __page_cache_alloc(gfp | __GFP_COLD);
1791  if (!page)
1792  return ERR_PTR(-ENOMEM);
1793  err = add_to_page_cache_lru(page, mapping, index, gfp);
1794  if (unlikely(err)) {
1795  page_cache_release(page);
1796  if (err == -EEXIST)
1797  goto repeat;
1798  /* Presumably ENOMEM for radix tree node */
1799  return ERR_PTR(err);
1800  }
1801  err = filler(data, page);
1802  if (err < 0) {
1803  page_cache_release(page);
1804  page = ERR_PTR(err);
1805  }
1806  }
1807  return page;
1808 }
1809 
1810 static struct page *do_read_cache_page(struct address_space *mapping,
1811  pgoff_t index,
1812  int (*filler)(void *, struct page *),
1813  void *data,
1814  gfp_t gfp)
1815 
1816 {
1817  struct page *page;
1818  int err;
1819 
1820 retry:
1821  page = __read_cache_page(mapping, index, filler, data, gfp);
1822  if (IS_ERR(page))
1823  return page;
1824  if (PageUptodate(page))
1825  goto out;
1826 
1827  lock_page(page);
1828  if (!page->mapping) {
1829  unlock_page(page);
1830  page_cache_release(page);
1831  goto retry;
1832  }
1833  if (PageUptodate(page)) {
1834  unlock_page(page);
1835  goto out;
1836  }
1837  err = filler(data, page);
1838  if (err < 0) {
1839  page_cache_release(page);
1840  return ERR_PTR(err);
1841  }
1842 out:
1843  mark_page_accessed(page);
1844  return page;
1845 }
1846 
1862 struct page *read_cache_page_async(struct address_space *mapping,
1863  pgoff_t index,
1864  int (*filler)(void *, struct page *),
1865  void *data)
1866 {
1867  return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1868 }
1870 
1871 static struct page *wait_on_page_read(struct page *page)
1872 {
1873  if (!IS_ERR(page)) {
1874  wait_on_page_locked(page);
1875  if (!PageUptodate(page)) {
1876  page_cache_release(page);
1877  page = ERR_PTR(-EIO);
1878  }
1879  }
1880  return page;
1881 }
1882 
1894 struct page *read_cache_page_gfp(struct address_space *mapping,
1895  pgoff_t index,
1896  gfp_t gfp)
1897 {
1898  filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1899 
1900  return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1901 }
1903 
1916 struct page *read_cache_page(struct address_space *mapping,
1917  pgoff_t index,
1918  int (*filler)(void *, struct page *),
1919  void *data)
1920 {
1921  return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1922 }
1924 
1925 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1926  const struct iovec *iov, size_t base, size_t bytes)
1927 {
1928  size_t copied = 0, left = 0;
1929 
1930  while (bytes) {
1931  char __user *buf = iov->iov_base + base;
1932  int copy = min(bytes, iov->iov_len - base);
1933 
1934  base = 0;
1935  left = __copy_from_user_inatomic(vaddr, buf, copy);
1936  copied += copy;
1937  bytes -= copy;
1938  vaddr += copy;
1939  iov++;
1940 
1941  if (unlikely(left))
1942  break;
1943  }
1944  return copied - left;
1945 }
1946 
1947 /*
1948  * Copy as much as we can into the page and return the number of bytes which
1949  * were successfully copied. If a fault is encountered then return the number of
1950  * bytes which were copied.
1951  */
1952 size_t iov_iter_copy_from_user_atomic(struct page *page,
1953  struct iov_iter *i, unsigned long offset, size_t bytes)
1954 {
1955  char *kaddr;
1956  size_t copied;
1957 
1958  BUG_ON(!in_atomic());
1959  kaddr = kmap_atomic(page);
1960  if (likely(i->nr_segs == 1)) {
1961  int left;
1962  char __user *buf = i->iov->iov_base + i->iov_offset;
1963  left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1964  copied = bytes - left;
1965  } else {
1966  copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1967  i->iov, i->iov_offset, bytes);
1968  }
1969  kunmap_atomic(kaddr);
1970 
1971  return copied;
1972 }
1974 
1975 /*
1976  * This has the same sideeffects and return value as
1977  * iov_iter_copy_from_user_atomic().
1978  * The difference is that it attempts to resolve faults.
1979  * Page must not be locked.
1980  */
1981 size_t iov_iter_copy_from_user(struct page *page,
1982  struct iov_iter *i, unsigned long offset, size_t bytes)
1983 {
1984  char *kaddr;
1985  size_t copied;
1986 
1987  kaddr = kmap(page);
1988  if (likely(i->nr_segs == 1)) {
1989  int left;
1990  char __user *buf = i->iov->iov_base + i->iov_offset;
1991  left = __copy_from_user(kaddr + offset, buf, bytes);
1992  copied = bytes - left;
1993  } else {
1994  copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1995  i->iov, i->iov_offset, bytes);
1996  }
1997  kunmap(page);
1998  return copied;
1999 }
2001 
2002 void iov_iter_advance(struct iov_iter *i, size_t bytes)
2003 {
2004  BUG_ON(i->count < bytes);
2005 
2006  if (likely(i->nr_segs == 1)) {
2007  i->iov_offset += bytes;
2008  i->count -= bytes;
2009  } else {
2010  const struct iovec *iov = i->iov;
2011  size_t base = i->iov_offset;
2012  unsigned long nr_segs = i->nr_segs;
2013 
2014  /*
2015  * The !iov->iov_len check ensures we skip over unlikely
2016  * zero-length segments (without overruning the iovec).
2017  */
2018  while (bytes || unlikely(i->count && !iov->iov_len)) {
2019  int copy;
2020 
2021  copy = min(bytes, iov->iov_len - base);
2022  BUG_ON(!i->count || i->count < copy);
2023  i->count -= copy;
2024  bytes -= copy;
2025  base += copy;
2026  if (iov->iov_len == base) {
2027  iov++;
2028  nr_segs--;
2029  base = 0;
2030  }
2031  }
2032  i->iov = iov;
2033  i->iov_offset = base;
2034  i->nr_segs = nr_segs;
2035  }
2036 }
2038 
2039 /*
2040  * Fault in the first iovec of the given iov_iter, to a maximum length
2041  * of bytes. Returns 0 on success, or non-zero if the memory could not be
2042  * accessed (ie. because it is an invalid address).
2043  *
2044  * writev-intensive code may want this to prefault several iovecs -- that
2045  * would be possible (callers must not rely on the fact that _only_ the
2046  * first iovec will be faulted with the current implementation).
2047  */
2048 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
2049 {
2050  char __user *buf = i->iov->iov_base + i->iov_offset;
2051  bytes = min(bytes, i->iov->iov_len - i->iov_offset);
2052  return fault_in_pages_readable(buf, bytes);
2053 }
2055 
2056 /*
2057  * Return the count of just the current iov_iter segment.
2058  */
2059 size_t iov_iter_single_seg_count(struct iov_iter *i)
2060 {
2061  const struct iovec *iov = i->iov;
2062  if (i->nr_segs == 1)
2063  return i->count;
2064  else
2065  return min(i->count, iov->iov_len - i->iov_offset);
2066 }
2068 
2069 /*
2070  * Performs necessary checks before doing a write
2071  *
2072  * Can adjust writing position or amount of bytes to write.
2073  * Returns appropriate error code that caller should return or
2074  * zero in case that write should be allowed.
2075  */
2076 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
2077 {
2078  struct inode *inode = file->f_mapping->host;
2079  unsigned long limit = rlimit(RLIMIT_FSIZE);
2080 
2081  if (unlikely(*pos < 0))
2082  return -EINVAL;
2083 
2084  if (!isblk) {
2085  /* FIXME: this is for backwards compatibility with 2.4 */
2086  if (file->f_flags & O_APPEND)
2087  *pos = i_size_read(inode);
2088 
2089  if (limit != RLIM_INFINITY) {
2090  if (*pos >= limit) {
2091  send_sig(SIGXFSZ, current, 0);
2092  return -EFBIG;
2093  }
2094  if (*count > limit - (typeof(limit))*pos) {
2095  *count = limit - (typeof(limit))*pos;
2096  }
2097  }
2098  }
2099 
2100  /*
2101  * LFS rule
2102  */
2103  if (unlikely(*pos + *count > MAX_NON_LFS &&
2104  !(file->f_flags & O_LARGEFILE))) {
2105  if (*pos >= MAX_NON_LFS) {
2106  return -EFBIG;
2107  }
2108  if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2109  *count = MAX_NON_LFS - (unsigned long)*pos;
2110  }
2111  }
2112 
2113  /*
2114  * Are we about to exceed the fs block limit ?
2115  *
2116  * If we have written data it becomes a short write. If we have
2117  * exceeded without writing data we send a signal and return EFBIG.
2118  * Linus frestrict idea will clean these up nicely..
2119  */
2120  if (likely(!isblk)) {
2121  if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2122  if (*count || *pos > inode->i_sb->s_maxbytes) {
2123  return -EFBIG;
2124  }
2125  /* zero-length writes at ->s_maxbytes are OK */
2126  }
2127 
2128  if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2129  *count = inode->i_sb->s_maxbytes - *pos;
2130  } else {
2131 #ifdef CONFIG_BLOCK
2132  loff_t isize;
2133  if (bdev_read_only(I_BDEV(inode)))
2134  return -EPERM;
2135  isize = i_size_read(inode);
2136  if (*pos >= isize) {
2137  if (*count || *pos > isize)
2138  return -ENOSPC;
2139  }
2140 
2141  if (*pos + *count > isize)
2142  *count = isize - *pos;
2143 #else
2144  return -EPERM;
2145 #endif
2146  }
2147  return 0;
2148 }
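/*
 * Worked example of the RLIMIT_FSIZE clamp above: with a 1 MiB file
 * size limit, a write of 8 KiB starting 4 KiB below the limit is
 * trimmed to 4 KiB; a write starting at or beyond the limit gets
 * SIGXFSZ and -EFBIG instead.
 */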
2150 
2151 int pagecache_write_begin(struct file *file, struct address_space *mapping,
2152  loff_t pos, unsigned len, unsigned flags,
2153  struct page **pagep, void **fsdata)
2154 {
2155  const struct address_space_operations *aops = mapping->a_ops;
2156 
2157  return aops->write_begin(file, mapping, pos, len, flags,
2158  pagep, fsdata);
2159 }
2161 
2162 int pagecache_write_end(struct file *file, struct address_space *mapping,
2163  loff_t pos, unsigned len, unsigned copied,
2164  struct page *page, void *fsdata)
2165 {
2166  const struct address_space_operations *aops = mapping->a_ops;
2167 
2168  mark_page_accessed(page);
2169  return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2170 }
2172 
2173 ssize_t
2174 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2175  unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2176  size_t count, size_t ocount)
2177 {
2178  struct file *file = iocb->ki_filp;
2179  struct address_space *mapping = file->f_mapping;
2180  struct inode *inode = mapping->host;
2181  ssize_t written;
2182  size_t write_len;
2183  pgoff_t end;
2184 
2185  if (count != ocount)
2186  *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2187 
2188  write_len = iov_length(iov, *nr_segs);
2189  end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2190 
2191  written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2192  if (written)
2193  goto out;
2194 
2195  /*
2196  * After a write we want buffered reads to be sure to go to disk to get
2197  * the new data. We invalidate clean cached page from the region we're
2198  * about to write. We do this *before* the write so that we can return
2199  * without clobbering -EIOCBQUEUED from ->direct_IO().
2200  */
2201  if (mapping->nrpages) {
2202  written = invalidate_inode_pages2_range(mapping,
2203  pos >> PAGE_CACHE_SHIFT, end);
2204  /*
2205  * If a page can not be invalidated, return 0 to fall back
2206  * to buffered write.
2207  */
2208  if (written) {
2209  if (written == -EBUSY)
2210  return 0;
2211  goto out;
2212  }
2213  }
2214 
2215  written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2216 
2217  /*
2218  * Finally, try again to invalidate clean pages which might have been
2219  * cached by non-direct readahead, or faulted in by get_user_pages()
2220  * if the source of the write was an mmap'ed region of the file
2221  * we're writing. Either one is a pretty crazy thing to do,
2222  * so we don't support it 100%. If this invalidation
2223  * fails, tough, the write still worked...
2224  */
2225  if (mapping->nrpages) {
2226  invalidate_inode_pages2_range(mapping,
2227  pos >> PAGE_CACHE_SHIFT, end);
2228  }
2229 
2230  if (written > 0) {
2231  pos += written;
2232  if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2233  i_size_write(inode, pos);
2234  mark_inode_dirty(inode);
2235  }
2236  *ppos = pos;
2237  }
2238 out:
2239  return written;
2240 }
2242 
2243 /*
2244  * Find or create a page at the given pagecache position. Return the locked
2245  * page. This function is specifically for buffered writes.
2246  */
2247 struct page *grab_cache_page_write_begin(struct address_space *mapping,
2248  pgoff_t index, unsigned flags)
2249 {
2250  int status;
2251  gfp_t gfp_mask;
2252  struct page *page;
2253  gfp_t gfp_notmask = 0;
2254 
2255  gfp_mask = mapping_gfp_mask(mapping);
2256  if (mapping_cap_account_dirty(mapping))
2257  gfp_mask |= __GFP_WRITE;
2258  if (flags & AOP_FLAG_NOFS)
2259  gfp_notmask = __GFP_FS;
2260 repeat:
2261  page = find_lock_page(mapping, index);
2262  if (page)
2263  goto found;
2264 
2265  page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
2266  if (!page)
2267  return NULL;
2268  status = add_to_page_cache_lru(page, mapping, index,
2269  GFP_KERNEL & ~gfp_notmask);
2270  if (unlikely(status)) {
2271  page_cache_release(page);
2272  if (status == -EEXIST)
2273  goto repeat;
2274  return NULL;
2275  }
2276 found:
2277  wait_on_page_writeback(page);
2278  return page;
2279 }
2281 
2282 static ssize_t generic_perform_write(struct file *file,
2283  struct iov_iter *i, loff_t pos)
2284 {
2285  struct address_space *mapping = file->f_mapping;
2286  const struct address_space_operations *a_ops = mapping->a_ops;
2287  long status = 0;
2288  ssize_t written = 0;
2289  unsigned int flags = 0;
2290 
2291  /*
2292  * Copies from kernel address space cannot fail (NFSD is a big user).
2293  */
2294  if (segment_eq(get_fs(), KERNEL_DS))
2295  flags |= AOP_FLAG_UNINTERRUPTIBLE;
2296 
2297  do {
2298  struct page *page;
2299  unsigned long offset; /* Offset into pagecache page */
2300  unsigned long bytes; /* Bytes to write to page */
2301  size_t copied; /* Bytes copied from user */
2302  void *fsdata;
2303 
2304  offset = (pos & (PAGE_CACHE_SIZE - 1));
2305  bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2306  iov_iter_count(i));
2307 
2308 again:
2309  /*
2310  * Bring in the user page that we will copy from _first_.
2311  * Otherwise there's a nasty deadlock on copying from the
2312  * same page as we're writing to, without it being marked
2313  * up-to-date.
2314  *
2315  * Not only is this an optimisation, but it is also required
2316  * to check that the address is actually valid, when atomic
2317  * usercopies are used, below.
2318  */
2319  if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2320  status = -EFAULT;
2321  break;
2322  }
2323 
2324  status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2325  &page, &fsdata);
2326  if (unlikely(status))
2327  break;
2328 
2329  if (mapping_writably_mapped(mapping))
2330  flush_dcache_page(page);
2331 
2332  pagefault_disable();
2333  copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2334  pagefault_enable();
2335  flush_dcache_page(page);
2336 
2337  mark_page_accessed(page);
2338  status = a_ops->write_end(file, mapping, pos, bytes, copied,
2339  page, fsdata);
2340  if (unlikely(status < 0))
2341  break;
2342  copied = status;
2343 
2344  cond_resched();
2345 
2346  iov_iter_advance(i, copied);
2347  if (unlikely(copied == 0)) {
2348  /*
2349  * If we were unable to copy any data at all, we must
2350  * fall back to a single segment length write.
2351  *
2352  * If we didn't fallback here, we could livelock
2353  * because not all segments in the iov can be copied at
2354  * once without a pagefault.
2355  */
2356  bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2357  iov_iter_single_seg_count(i));
2358  goto again;
2359  }
2360  pos += copied;
2361  written += copied;
2362 
2363  balance_dirty_pages_ratelimited(mapping);
2364  if (fatal_signal_pending(current)) {
2365  status = -EINTR;
2366  break;
2367  }
2368  } while (iov_iter_count(i));
2369 
2370  return written ? written : status;
2371 }
2372 
2373 ssize_t
2374 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2375  unsigned long nr_segs, loff_t pos, loff_t *ppos,
2376  size_t count, ssize_t written)
2377 {
2378  struct file *file = iocb->ki_filp;
2379  ssize_t status;
2380  struct iov_iter i;
2381 
2382  iov_iter_init(&i, iov, nr_segs, count, written);
2383  status = generic_perform_write(file, &i, pos);
2384 
2385  if (likely(status >= 0)) {
2386  written += status;
2387  *ppos = pos + status;
2388  }
2389 
2390  return written ? written : status;
2391 }
2393 
2413 ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2414  unsigned long nr_segs, loff_t *ppos)
2415 {
2416  struct file *file = iocb->ki_filp;
2417  struct address_space * mapping = file->f_mapping;
2418  size_t ocount; /* original count */
2419  size_t count; /* after file limit checks */
2420  struct inode *inode = mapping->host;
2421  loff_t pos;
2422  ssize_t written;
2423  ssize_t err;
2424 
2425  ocount = 0;
2426  err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2427  if (err)
2428  return err;
2429 
2430  count = ocount;
2431  pos = *ppos;
2432 
2433  /* We can write back this queue in page reclaim */
2434  current->backing_dev_info = mapping->backing_dev_info;
2435  written = 0;
2436 
2437  err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2438  if (err)
2439  goto out;
2440 
2441  if (count == 0)
2442  goto out;
2443 
2444  err = file_remove_suid(file);
2445  if (err)
2446  goto out;
2447 
2448  err = file_update_time(file);
2449  if (err)
2450  goto out;
2451 
2452  /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2453  if (unlikely(file->f_flags & O_DIRECT)) {
2454  loff_t endbyte;
2455  ssize_t written_buffered;
2456 
2457  written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2458  ppos, count, ocount);
2459  if (written < 0 || written == count)
2460  goto out;
2461  /*
2462  * direct-io write to a hole: fall through to buffered I/O
2463  * for completing the rest of the request.
2464  */
2465  pos += written;
2466  count -= written;
2467  written_buffered = generic_file_buffered_write(iocb, iov,
2468  nr_segs, pos, ppos, count,
2469  written);
2470  /*
2471  * If generic_file_buffered_write() returned a synchronous error
2472  * then we want to return the number of bytes which were
2473  * direct-written, or the error code if that was zero. Note
2474  * that this differs from normal direct-io semantics, which
2475  * will return -EFOO even if some bytes were written.
2476  */
2477  if (written_buffered < 0) {
2478  err = written_buffered;
2479  goto out;
2480  }
2481 
2482  /*
2483  * We need to ensure that the page cache pages are written to
2484  * disk and invalidated to preserve the expected O_DIRECT
2485  * semantics.
2486  */
2487  endbyte = pos + written_buffered - written - 1;
2488  err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2489  if (err == 0) {
2490  written = written_buffered;
2491  invalidate_mapping_pages(mapping,
2492  pos >> PAGE_CACHE_SHIFT,
2493  endbyte >> PAGE_CACHE_SHIFT);
2494  } else {
2495  /*
2496  * We don't know how much we wrote, so just return
2497  * the number of bytes which were direct-written
2498  */
2499  }
2500  } else {
2501  written = generic_file_buffered_write(iocb, iov, nr_segs,
2502  pos, ppos, count, written);
2503  }
2504 out:
2505  current->backing_dev_info = NULL;
2506  return written ? written : err;
2507 }
2509 
2521 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2522  unsigned long nr_segs, loff_t pos)
2523 {
2524  struct file *file = iocb->ki_filp;
2525  struct inode *inode = file->f_mapping->host;
2526  ssize_t ret;
2527 
2528  BUG_ON(iocb->ki_pos != pos);
2529 
2530  sb_start_write(inode->i_sb);
2531  mutex_lock(&inode->i_mutex);
2532  ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2533  mutex_unlock(&inode->i_mutex);
2534 
2535  if (ret > 0 || ret == -EIOCBQUEUED) {
2536  ssize_t err;
2537 
2538  err = generic_write_sync(file, pos, ret);
2539  if (err < 0 && ret > 0)
2540  ret = err;
2541  }
2542  sb_end_write(inode->i_sb);
2543  return ret;
2544 }
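/*
 * Hypothetical sketch (not part of this file): how a simple 3.7-era
 * filesystem typically wires these generic helpers into its
 * file_operations, with do_sync_read()/do_sync_write() providing the
 * synchronous entry points on top of the aio variants.
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,
	.splice_read	= generic_file_splice_read,
};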
2546 
2564 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2565 {
2566  struct address_space * const mapping = page->mapping;
2567 
2568  BUG_ON(!PageLocked(page));
2569  if (PageWriteback(page))
2570  return 0;
2571 
2572  if (mapping && mapping->a_ops->releasepage)
2573  return mapping->a_ops->releasepage(page, gfp_mask);
2574  return try_to_free_buffers(page);
2575 }
2576