Linux Kernel  3.7.1
xfs_buf.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include "xfs.h"
19 #include <linux/stddef.h>
20 #include <linux/errno.h>
21 #include <linux/gfp.h>
22 #include <linux/pagemap.h>
23 #include <linux/init.h>
24 #include <linux/vmalloc.h>
25 #include <linux/bio.h>
26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h>
29 #include <linux/percpu.h>
30 #include <linux/blkdev.h>
31 #include <linux/hash.h>
32 #include <linux/kthread.h>
33 #include <linux/migrate.h>
34 #include <linux/backing-dev.h>
35 #include <linux/freezer.h>
36 
37 #include "xfs_sb.h"
38 #include "xfs_log.h"
39 #include "xfs_ag.h"
40 #include "xfs_mount.h"
41 #include "xfs_trace.h"
42 
43 static kmem_zone_t *xfs_buf_zone;
44 
45 static struct workqueue_struct *xfslogd_workqueue;
46 
47 #ifdef XFS_BUF_LOCK_TRACKING
48 # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
49 # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
50 # define XB_GET_OWNER(bp) ((bp)->b_last_holder)
51 #else
52 # define XB_SET_OWNER(bp) do { } while (0)
53 # define XB_CLEAR_OWNER(bp) do { } while (0)
54 # define XB_GET_OWNER(bp) do { } while (0)
55 #endif
56 
57 #define xb_to_gfp(flags) \
58  ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
59 
60 
61 static inline int
62 xfs_buf_is_vmapped(
63  struct xfs_buf *bp)
64 {
65  /*
66  * Return true if the buffer is vmapped.
67  *
68  * b_addr is null if the buffer is not mapped, but the code is clever
69  * enough to know it doesn't have to map a single page, so the check has
70  * to be both for b_addr and bp->b_page_count > 1.
71  */
72  return bp->b_addr && bp->b_page_count > 1;
73 }
74 
75 static inline int
76 xfs_buf_vmap_len(
77  struct xfs_buf *bp)
78 {
79  return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
80 }
81 
82 /*
83  * xfs_buf_lru_add - add a buffer to the LRU.
84  *
85  * The LRU takes a new reference to the buffer so that it will only be freed
86  * once the shrinker takes the buffer off the LRU.
87  */
88 STATIC void
89 xfs_buf_lru_add(
90  struct xfs_buf *bp)
91 {
92  struct xfs_buftarg *btp = bp->b_target;
93 
94  spin_lock(&btp->bt_lru_lock);
95  if (list_empty(&bp->b_lru)) {
96  atomic_inc(&bp->b_hold);
97  list_add_tail(&bp->b_lru, &btp->bt_lru);
98  btp->bt_lru_nr++;
99  bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
100  }
101  spin_unlock(&btp->bt_lru_lock);
102 }
103 
104 /*
105  * xfs_buf_lru_del - remove a buffer from the LRU
106  *
107  * The unlocked check is safe here because it only occurs when there are no
108  * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
109  * to optimise the shrinker removing the buffer from the LRU and calling
110  * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
111  * bt_lru_lock.
112  */
113 STATIC void
114 xfs_buf_lru_del(
115  struct xfs_buf *bp)
116 {
117  struct xfs_buftarg *btp = bp->b_target;
118 
119  if (list_empty(&bp->b_lru))
120  return;
121 
122  spin_lock(&btp->bt_lru_lock);
123  if (!list_empty(&bp->b_lru)) {
124  list_del_init(&bp->b_lru);
125  btp->bt_lru_nr--;
126  }
127  spin_unlock(&btp->bt_lru_lock);
128 }
129 
130 /*
131  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
132  * b_lru_ref count so that the buffer is freed immediately when the buffer
133  * reference count falls to zero. If the buffer is already on the LRU, we need
134  * to remove the reference that LRU holds on the buffer.
135  *
136  * This prevents build-up of stale buffers on the LRU.
137  */
138 void
139 xfs_buf_stale(
140  struct xfs_buf *bp)
141 {
142  ASSERT(xfs_buf_islocked(bp));
143 
144  bp->b_flags |= XBF_STALE;
145 
146  /*
147  * Clear the delwri status so that a delwri queue walker will not
148  * flush this buffer to disk now that it is stale. The delwri queue has
149  * a reference to the buffer, so this is safe to do.
150  */
151  bp->b_flags &= ~_XBF_DELWRI_Q;
152 
153  atomic_set(&(bp)->b_lru_ref, 0);
154  if (!list_empty(&bp->b_lru)) {
155  struct xfs_buftarg *btp = bp->b_target;
156 
157  spin_lock(&btp->bt_lru_lock);
158  if (!list_empty(&bp->b_lru) &&
159  !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
160  list_del_init(&bp->b_lru);
161  btp->bt_lru_nr--;
162  atomic_dec(&bp->b_hold);
163  }
164  spin_unlock(&btp->bt_lru_lock);
165  }
166  ASSERT(atomic_read(&bp->b_hold) >= 1);
167 }
168 
169 static int
170 xfs_buf_get_maps(
171  struct xfs_buf *bp,
172  int map_count)
173 {
174  ASSERT(bp->b_maps == NULL);
175  bp->b_map_count = map_count;
176 
177  if (map_count == 1) {
178  bp->b_maps = &bp->b_map;
179  return 0;
180  }
181 
182  bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
183  KM_NOFS);
184  if (!bp->b_maps)
185  return ENOMEM;
186  return 0;
187 }
188 
189 /*
190  * Frees b_pages if it was allocated.
191  */
192 static void
193 xfs_buf_free_maps(
194  struct xfs_buf *bp)
195 {
196  if (bp->b_maps != &bp->b_map) {
197  kmem_free(bp->b_maps);
198  bp->b_maps = NULL;
199  }
200 }
201 
202 struct xfs_buf *
203 _xfs_buf_alloc(
204  struct xfs_buftarg *target,
205  struct xfs_buf_map *map,
206  int nmaps,
207  xfs_buf_flags_t flags)
208 {
209  struct xfs_buf *bp;
210  int error;
211  int i;
212 
213  bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
214  if (unlikely(!bp))
215  return NULL;
216 
217  /*
218  * We don't want certain flags to appear in b_flags unless they are
219  * specifically set by later operations on the buffer.
220  */
221  flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
222 
223  atomic_set(&bp->b_hold, 1);
224  atomic_set(&bp->b_lru_ref, 1);
225  init_completion(&bp->b_iowait);
226  INIT_LIST_HEAD(&bp->b_lru);
227  INIT_LIST_HEAD(&bp->b_list);
228  RB_CLEAR_NODE(&bp->b_rbnode);
229  sema_init(&bp->b_sema, 0); /* held, no waiters */
230  XB_SET_OWNER(bp);
231  bp->b_target = target;
232  bp->b_flags = flags;
233 
234  /*
235  * Set length and io_length to the same value initially.
236  * I/O routines should use io_length, which will be the same in
237  * most cases but may be reset (e.g. XFS recovery).
238  */
239  error = xfs_buf_get_maps(bp, nmaps);
240  if (error) {
241  kmem_zone_free(xfs_buf_zone, bp);
242  return NULL;
243  }
244 
245  bp->b_bn = map[0].bm_bn;
246  bp->b_length = 0;
247  for (i = 0; i < nmaps; i++) {
248  bp->b_maps[i].bm_bn = map[i].bm_bn;
249  bp->b_maps[i].bm_len = map[i].bm_len;
250  bp->b_length += map[i].bm_len;
251  }
252  bp->b_io_length = bp->b_length;
253 
254  atomic_set(&bp->b_pin_count, 0);
255  init_waitqueue_head(&bp->b_waiters);
256 
257  XFS_STATS_INC(xb_create);
258  trace_xfs_buf_init(bp, _RET_IP_);
259 
260  return bp;
261 }
262 
263 /*
264  * Allocate a page array capable of holding a specified number
265  * of pages, and point the page buf at it.
266  */
267 STATIC int
268 _xfs_buf_get_pages(
269  xfs_buf_t *bp,
270  int page_count,
271  xfs_buf_flags_t flags)
272 {
273  /* Make sure that we have a page list */
274  if (bp->b_pages == NULL) {
275  bp->b_page_count = page_count;
276  if (page_count <= XB_PAGES) {
277  bp->b_pages = bp->b_page_array;
278  } else {
279  bp->b_pages = kmem_alloc(sizeof(struct page *) *
280  page_count, KM_NOFS);
281  if (bp->b_pages == NULL)
282  return -ENOMEM;
283  }
284  memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
285  }
286  return 0;
287 }
288 
289 /*
290  * Frees b_pages if it was allocated.
291  */
292 STATIC void
293 _xfs_buf_free_pages(
294  xfs_buf_t *bp)
295 {
296  if (bp->b_pages != bp->b_page_array) {
297  kmem_free(bp->b_pages);
298  bp->b_pages = NULL;
299  }
300 }
301 
302 /*
303  * Releases the specified buffer.
304  *
305  * The modification state of any associated pages is left unchanged.
306  * The buffer must not be on any hash - use xfs_buf_rele instead for
307  * hashed and refcounted buffers
308  */
309 void
310 xfs_buf_free(
311  xfs_buf_t *bp)
312 {
313  trace_xfs_buf_free(bp, _RET_IP_);
314 
315  ASSERT(list_empty(&bp->b_lru));
316 
317  if (bp->b_flags & _XBF_PAGES) {
318  uint i;
319 
320  if (xfs_buf_is_vmapped(bp))
321  vm_unmap_ram(bp->b_addr - bp->b_offset,
322  bp->b_page_count);
323 
324  for (i = 0; i < bp->b_page_count; i++) {
325  struct page *page = bp->b_pages[i];
326 
327  __free_page(page);
328  }
329  } else if (bp->b_flags & _XBF_KMEM)
330  kmem_free(bp->b_addr);
331  _xfs_buf_free_pages(bp);
332  xfs_buf_free_maps(bp);
333  kmem_zone_free(xfs_buf_zone, bp);
334 }
335 
336 /*
337  * Allocates all the pages for the buffer in question and builds its page list.
338  */
339 STATIC int
340 xfs_buf_allocate_memory(
341  xfs_buf_t *bp,
342  uint flags)
343 {
344  size_t size;
345  size_t nbytes, offset;
346  gfp_t gfp_mask = xb_to_gfp(flags);
347  unsigned short page_count, i;
348  xfs_off_t start, end;
349  int error;
350 
351  /*
352  * for buffers that are contained within a single page, just allocate
353  * the memory from the heap - there's no need for the complexity of
354  * page arrays to keep allocation down to order 0.
355  */
356  size = BBTOB(bp->b_length);
357  if (size < PAGE_SIZE) {
358  bp->b_addr = kmem_alloc(size, KM_NOFS);
359  if (!bp->b_addr) {
360  /* low memory - use alloc_page loop instead */
361  goto use_alloc_page;
362  }
363 
364  if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
365  ((unsigned long)bp->b_addr & PAGE_MASK)) {
366  /* b_addr spans two pages - use alloc_page instead */
367  kmem_free(bp->b_addr);
368  bp->b_addr = NULL;
369  goto use_alloc_page;
370  }
371  bp->b_offset = offset_in_page(bp->b_addr);
372  bp->b_pages = bp->b_page_array;
373  bp->b_pages[0] = virt_to_page(bp->b_addr);
374  bp->b_page_count = 1;
375  bp->b_flags |= _XBF_KMEM;
376  return 0;
377  }
378 
379 use_alloc_page:
380  start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT;
381  end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1)
382  >> PAGE_SHIFT;
383  page_count = end - start;
384  error = _xfs_buf_get_pages(bp, page_count, flags);
385  if (unlikely(error))
386  return error;
387 
388  offset = bp->b_offset;
389  bp->b_flags |= _XBF_PAGES;
390 
391  for (i = 0; i < bp->b_page_count; i++) {
392  struct page *page;
393  uint retries = 0;
394 retry:
395  page = alloc_page(gfp_mask);
396  if (unlikely(page == NULL)) {
397  if (flags & XBF_READ_AHEAD) {
398  bp->b_page_count = i;
399  error = ENOMEM;
400  goto out_free_pages;
401  }
402 
403  /*
404  * This could deadlock.
405  *
406  * But until all the XFS lowlevel code is revamped to
407  * handle buffer allocation failures we can't do much.
408  */
409  if (!(++retries % 100))
410  xfs_err(NULL,
411  "possible memory allocation deadlock in %s (mode:0x%x)",
412  __func__, gfp_mask);
413 
414  XFS_STATS_INC(xb_page_retries);
415  congestion_wait(BLK_RW_ASYNC, HZ/50);
416  goto retry;
417  }
418 
419  XFS_STATS_INC(xb_page_found);
420 
421  nbytes = min_t(size_t, size, PAGE_SIZE - offset);
422  size -= nbytes;
423  bp->b_pages[i] = page;
424  offset = 0;
425  }
426  return 0;
427 
428 out_free_pages:
429  for (i = 0; i < bp->b_page_count; i++)
430  __free_page(bp->b_pages[i]);
431  return error;
432 }
433 
434 /*
435  * Map buffer into kernel address-space if necessary.
436  */
437 STATIC int
438 _xfs_buf_map_pages(
439  xfs_buf_t *bp,
440  uint flags)
441 {
442  ASSERT(bp->b_flags & _XBF_PAGES);
443  if (bp->b_page_count == 1) {
444  /* A single page buffer is always mappable */
445  bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
446  } else if (flags & XBF_UNMAPPED) {
447  bp->b_addr = NULL;
448  } else {
449  int retried = 0;
450 
451  do {
452  bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
453  -1, PAGE_KERNEL);
454  if (bp->b_addr)
455  break;
456  vm_unmap_aliases();
457  } while (retried++ <= 1);
458 
459  if (!bp->b_addr)
460  return -ENOMEM;
461  bp->b_addr += bp->b_offset;
462  }
463 
464  return 0;
465 }
466 
467 /*
468  * Finding and Reading Buffers
469  */
470 
471 /*
472  * Looks up, and creates if absent, a lockable buffer for
473  * a given range of an inode. The buffer is returned
474  * locked. No I/O is implied by this call.
475  */
476 xfs_buf_t *
477 _xfs_buf_find(
478  struct xfs_buftarg *btp,
479  struct xfs_buf_map *map,
480  int nmaps,
481  xfs_buf_flags_t flags,
482  xfs_buf_t *new_bp)
483 {
484  size_t numbytes;
485  struct xfs_perag *pag;
486  struct rb_node **rbp;
487  struct rb_node *parent;
488  xfs_buf_t *bp;
489  xfs_daddr_t blkno = map[0].bm_bn;
490  int numblks = 0;
491  int i;
492 
493  for (i = 0; i < nmaps; i++)
494  numblks += map[i].bm_len;
495  numbytes = BBTOB(numblks);
496 
497  /* Check for IOs smaller than the sector size / not sector aligned */
498  ASSERT(!(numbytes < (1 << btp->bt_sshift)));
499  ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
500 
501  /* get tree root */
502  pag = xfs_perag_get(btp->bt_mount,
503  xfs_daddr_to_agno(btp->bt_mount, blkno));
504 
505  /* walk tree */
506  spin_lock(&pag->pag_buf_lock);
507  rbp = &pag->pag_buf_tree.rb_node;
508  parent = NULL;
509  bp = NULL;
510  while (*rbp) {
511  parent = *rbp;
512  bp = rb_entry(parent, struct xfs_buf, b_rbnode);
513 
514  if (blkno < bp->b_bn)
515  rbp = &(*rbp)->rb_left;
516  else if (blkno > bp->b_bn)
517  rbp = &(*rbp)->rb_right;
518  else {
519  /*
520  * found a block number match. If the range doesn't
521  * match, the only way this is allowed is if the buffer
522  * in the cache is stale and the transaction that made
523  * it stale has not yet committed. i.e. we are
524  * reallocating a busy extent. Skip this buffer and
525  * continue searching to the right for an exact match.
526  */
527  if (bp->b_length != numblks) {
528  ASSERT(bp->b_flags & XBF_STALE);
529  rbp = &(*rbp)->rb_right;
530  continue;
531  }
532  atomic_inc(&bp->b_hold);
533  goto found;
534  }
535  }
536 
537  /* No match found */
538  if (new_bp) {
539  rb_link_node(&new_bp->b_rbnode, parent, rbp);
540  rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
541  /* the buffer keeps the perag reference until it is freed */
542  new_bp->b_pag = pag;
543  spin_unlock(&pag->pag_buf_lock);
544  } else {
545  XFS_STATS_INC(xb_miss_locked);
546  spin_unlock(&pag->pag_buf_lock);
547  xfs_perag_put(pag);
548  }
549  return new_bp;
550 
551 found:
552  spin_unlock(&pag->pag_buf_lock);
553  xfs_perag_put(pag);
554 
555  if (!xfs_buf_trylock(bp)) {
556  if (flags & XBF_TRYLOCK) {
557  xfs_buf_rele(bp);
558  XFS_STATS_INC(xb_busy_locked);
559  return NULL;
560  }
561  xfs_buf_lock(bp);
562  XFS_STATS_INC(xb_get_locked_waited);
563  }
564 
565  /*
566  * if the buffer is stale, clear all the external state associated with
567  * it. We need to keep flags such as how we allocated the buffer memory
568  * intact here.
569  */
570  if (bp->b_flags & XBF_STALE) {
571  ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572  bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
573  }
574 
575  trace_xfs_buf_find(bp, flags, _RET_IP_);
576  XFS_STATS_INC(xb_get_locked);
577  return bp;
578 }
579 
580 /*
581  * Assembles a buffer covering the specified range. The code is optimised for
582  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
583  * more hits than misses.
584  */
585 struct xfs_buf *
586 xfs_buf_get_map(
587  struct xfs_buftarg *target,
588  struct xfs_buf_map *map,
589  int nmaps,
590  xfs_buf_flags_t flags)
591 {
592  struct xfs_buf *bp;
593  struct xfs_buf *new_bp;
594  int error = 0;
595 
596  bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
597  if (likely(bp))
598  goto found;
599 
600  new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
601  if (unlikely(!new_bp))
602  return NULL;
603 
604  error = xfs_buf_allocate_memory(new_bp, flags);
605  if (error) {
606  xfs_buf_free(new_bp);
607  return NULL;
608  }
609 
610  bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
611  if (!bp) {
612  xfs_buf_free(new_bp);
613  return NULL;
614  }
615 
616  if (bp != new_bp)
617  xfs_buf_free(new_bp);
618 
619 found:
620  if (!bp->b_addr) {
621  error = _xfs_buf_map_pages(bp, flags);
622  if (unlikely(error)) {
623  xfs_warn(target->bt_mount,
624  "%s: failed to map pages\n", __func__);
625  xfs_buf_relse(bp);
626  return NULL;
627  }
628  }
629 
630  XFS_STATS_INC(xb_get);
631  trace_xfs_buf_get(bp, flags, _RET_IP_);
632  return bp;
633 }
634 
635 STATIC int
636 _xfs_buf_read(
637  xfs_buf_t *bp,
638  xfs_buf_flags_t flags)
639 {
640  ASSERT(!(flags & XBF_WRITE));
641  ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL);
642 
643  bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
644  bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
645 
646  xfs_buf_iorequest(bp);
647  if (flags & XBF_ASYNC)
648  return 0;
649  return xfs_buf_iowait(bp);
650 }
651 
652 xfs_buf_t *
653 xfs_buf_read_map(
654  struct xfs_buftarg *target,
655  struct xfs_buf_map *map,
656  int nmaps,
657  xfs_buf_flags_t flags)
658 {
659  struct xfs_buf *bp;
660 
661  flags |= XBF_READ;
662 
663  bp = xfs_buf_get_map(target, map, nmaps, flags);
664  if (bp) {
665  trace_xfs_buf_read(bp, flags, _RET_IP_);
666 
667  if (!XFS_BUF_ISDONE(bp)) {
668  XFS_STATS_INC(xb_get_read);
669  _xfs_buf_read(bp, flags);
670  } else if (flags & XBF_ASYNC) {
671  /*
672  * Read ahead call which is already satisfied,
673  * drop the buffer
674  */
675  xfs_buf_relse(bp);
676  return NULL;
677  } else {
678  /* We do not want read in the flags */
679  bp->b_flags &= ~XBF_READ;
680  }
681  }
682 
683  return bp;
684 }
685 
686 /*
687  * If we are not low on memory then do the readahead in a deadlock
688  * safe manner.
689  */
690 void
691 xfs_buf_readahead_map(
692  struct xfs_buftarg *target,
693  struct xfs_buf_map *map,
694  int nmaps)
695 {
696  if (bdi_read_congested(target->bt_bdi))
697  return;
698 
699  xfs_buf_read_map(target, map, nmaps,
700  XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
701 }
702 
703 /*
704  * Read an uncached buffer from disk. Allocates and returns a locked
705  * buffer containing the disk contents or nothing.
706  */
707 struct xfs_buf *
708 xfs_buf_read_uncached(
709  struct xfs_buftarg *target,
710  xfs_daddr_t daddr,
711  size_t numblks,
712  int flags)
713 {
714  xfs_buf_t *bp;
715  int error;
716 
717  bp = xfs_buf_get_uncached(target, numblks, flags);
718  if (!bp)
719  return NULL;
720 
721  /* set up the buffer for a read IO */
722  ASSERT(bp->b_map_count == 1);
723  bp->b_bn = daddr;
724  bp->b_maps[0].bm_bn = daddr;
725  bp->b_flags |= XBF_READ;
726 
727  xfsbdstrat(target->bt_mount, bp);
728  error = xfs_buf_iowait(bp);
729  if (error) {
730  xfs_buf_relse(bp);
731  return NULL;
732  }
733  return bp;
734 }
735 
736 /*
737  * Return a buffer allocated as an empty buffer and associated to external
738  * memory via xfs_buf_associate_memory() back to its empty state.
739  */
740 void
741 xfs_buf_set_empty(
742  struct xfs_buf *bp,
743  size_t numblks)
744 {
745  if (bp->b_pages)
746  _xfs_buf_free_pages(bp);
747 
748  bp->b_pages = NULL;
749  bp->b_page_count = 0;
750  bp->b_addr = NULL;
751  bp->b_length = numblks;
752  bp->b_io_length = numblks;
753 
754  ASSERT(bp->b_map_count == 1);
755  bp->b_bn = XFS_BUF_DADDR_NULL;
756  bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
757  bp->b_maps[0].bm_len = bp->b_length;
758 }
759 
760 static inline struct page *
761 mem_to_page(
762  void *addr)
763 {
764  if ((!is_vmalloc_addr(addr))) {
765  return virt_to_page(addr);
766  } else {
767  return vmalloc_to_page(addr);
768  }
769 }
770 
771 int
772 xfs_buf_associate_memory(
773  xfs_buf_t *bp,
774  void *mem,
775  size_t len)
776 {
777  int rval;
778  int i = 0;
779  unsigned long pageaddr;
780  unsigned long offset;
781  size_t buflen;
782  int page_count;
783 
784  pageaddr = (unsigned long)mem & PAGE_MASK;
785  offset = (unsigned long)mem - pageaddr;
786  buflen = PAGE_ALIGN(len + offset);
787  page_count = buflen >> PAGE_SHIFT;
788 
789  /* Free any previous set of page pointers */
790  if (bp->b_pages)
791  _xfs_buf_free_pages(bp);
792 
793  bp->b_pages = NULL;
794  bp->b_addr = mem;
795 
796  rval = _xfs_buf_get_pages(bp, page_count, 0);
797  if (rval)
798  return rval;
799 
800  bp->b_offset = offset;
801 
802  for (i = 0; i < bp->b_page_count; i++) {
803  bp->b_pages[i] = mem_to_page((void *)pageaddr);
804  pageaddr += PAGE_SIZE;
805  }
806 
807  bp->b_io_length = BTOBB(len);
808  bp->b_length = BTOBB(buflen);
809 
810  return 0;
811 }
812 
813 xfs_buf_t *
814 xfs_buf_get_uncached(
815  struct xfs_buftarg *target,
816  size_t numblks,
817  int flags)
818 {
819  unsigned long page_count;
820  int error, i;
821  struct xfs_buf *bp;
822  DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
823 
824  bp = _xfs_buf_alloc(target, &map, 1, 0);
825  if (unlikely(bp == NULL))
826  goto fail;
827 
828  page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
829  error = _xfs_buf_get_pages(bp, page_count, 0);
830  if (error)
831  goto fail_free_buf;
832 
833  for (i = 0; i < page_count; i++) {
834  bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
835  if (!bp->b_pages[i])
836  goto fail_free_mem;
837  }
838  bp->b_flags |= _XBF_PAGES;
839 
840  error = _xfs_buf_map_pages(bp, 0);
841  if (unlikely(error)) {
842  xfs_warn(target->bt_mount,
843  "%s: failed to map pages\n", __func__);
844  goto fail_free_mem;
845  }
846 
847  trace_xfs_buf_get_uncached(bp, _RET_IP_);
848  return bp;
849 
850  fail_free_mem:
851  while (--i >= 0)
852  __free_page(bp->b_pages[i]);
853  _xfs_buf_free_pages(bp);
854  fail_free_buf:
855  xfs_buf_free_maps(bp);
856  kmem_zone_free(xfs_buf_zone, bp);
857  fail:
858  return NULL;
859 }
860 
861 /*
862  * Increment reference count on buffer, to hold the buffer concurrently
863  * with another thread which may release (free) the buffer asynchronously.
864  * Must hold the buffer already to call this function.
865  */
866 void
867 xfs_buf_hold(
868  xfs_buf_t *bp)
869 {
870  trace_xfs_buf_hold(bp, _RET_IP_);
871  atomic_inc(&bp->b_hold);
872 }
873 
874 /*
875  * Releases a hold on the specified buffer. If the
876  * hold count is 1, calls xfs_buf_free.
877  */
878 void
879 xfs_buf_rele(
880  xfs_buf_t *bp)
881 {
882  struct xfs_perag *pag = bp->b_pag;
883 
884  trace_xfs_buf_rele(bp, _RET_IP_);
885 
886  if (!pag) {
887  ASSERT(list_empty(&bp->b_lru));
888  ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
889  if (atomic_dec_and_test(&bp->b_hold))
890  xfs_buf_free(bp);
891  return;
892  }
893 
894  ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
895 
896  ASSERT(atomic_read(&bp->b_hold) > 0);
897  if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
898  if (!(bp->b_flags & XBF_STALE) &&
899  atomic_read(&bp->b_lru_ref)) {
900  xfs_buf_lru_add(bp);
901  spin_unlock(&pag->pag_buf_lock);
902  } else {
903  xfs_buf_lru_del(bp);
904  ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
905  rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
906  spin_unlock(&pag->pag_buf_lock);
907  xfs_perag_put(pag);
908  xfs_buf_free(bp);
909  }
910  }
911 }
912 
913 
914 /*
915  * Lock a buffer object, if it is not already locked.
916  *
917  * If we come across a stale, pinned, locked buffer, we know that we are
918  * being asked to lock a buffer that has been reallocated. Because it is
919  * pinned, we know that the log has not been pushed to disk and hence it
920  * will still be locked. Rather than continuing to have trylock attempts
921  * fail until someone else pushes the log, push it ourselves before
922  * returning. This means that the xfsaild will not get stuck trying
923  * to push on stale inode buffers.
924  */
925 int
926 xfs_buf_trylock(
927  struct xfs_buf *bp)
928 {
929  int locked;
930 
931  locked = down_trylock(&bp->b_sema) == 0;
932  if (locked)
933  XB_SET_OWNER(bp);
934  else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
935  xfs_log_force(bp->b_target->bt_mount, 0);
936 
937  trace_xfs_buf_trylock(bp, _RET_IP_);
938  return locked;
939 }
940 
941 /*
942  * Lock a buffer object.
943  *
944  * If we come across a stale, pinned, locked buffer, we know that we
945  * are being asked to lock a buffer that has been reallocated. Because
946  * it is pinned, we know that the log has not been pushed to disk and
947  * hence it will still be locked. Rather than sleeping until someone
948  * else pushes the log, push it ourselves before trying to get the lock.
949  */
950 void
951 xfs_buf_lock(
952  struct xfs_buf *bp)
953 {
954  trace_xfs_buf_lock(bp, _RET_IP_);
955 
956  if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
957  xfs_log_force(bp->b_target->bt_mount, 0);
958  down(&bp->b_sema);
959  XB_SET_OWNER(bp);
960 
961  trace_xfs_buf_lock_done(bp, _RET_IP_);
962 }
963 
964 void
965 xfs_buf_unlock(
966  struct xfs_buf *bp)
967 {
968  XB_CLEAR_OWNER(bp);
969  up(&bp->b_sema);
970 
971  trace_xfs_buf_unlock(bp, _RET_IP_);
972 }
973 
974 STATIC void
975 xfs_buf_wait_unpin(
976  xfs_buf_t *bp)
977 {
978  DECLARE_WAITQUEUE (wait, current);
979 
980  if (atomic_read(&bp->b_pin_count) == 0)
981  return;
982 
983  add_wait_queue(&bp->b_waiters, &wait);
984  for (;;) {
985  set_current_state(TASK_UNINTERRUPTIBLE);
986  if (atomic_read(&bp->b_pin_count) == 0)
987  break;
988  io_schedule();
989  }
990  remove_wait_queue(&bp->b_waiters, &wait);
991  set_current_state(TASK_RUNNING);
992 }
993 
994 /*
995  * Buffer Utility Routines
996  */
997 
998 STATIC void
999 xfs_buf_iodone_work(
1000  struct work_struct *work)
1001 {
1002  xfs_buf_t *bp =
1003  container_of(work, xfs_buf_t, b_iodone_work);
1004 
1005  if (bp->b_iodone)
1006  (*(bp->b_iodone))(bp);
1007  else if (bp->b_flags & XBF_ASYNC)
1008  xfs_buf_relse(bp);
1009 }
1010 
1011 void
1012 xfs_buf_ioend(
1013  xfs_buf_t *bp,
1014  int schedule)
1015 {
1016  trace_xfs_buf_iodone(bp, _RET_IP_);
1017 
1018  bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019  if (bp->b_error == 0)
1020  bp->b_flags |= XBF_DONE;
1021 
1022  if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1023  if (schedule) {
1024  INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025  queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1026  } else {
1027  xfs_buf_iodone_work(&bp->b_iodone_work);
1028  }
1029  } else {
1030  complete(&bp->b_iowait);
1031  }
1032 }
1033 
1034 void
1035 xfs_buf_ioerror(
1036  xfs_buf_t *bp,
1037  int error)
1038 {
1039  ASSERT(error >= 0 && error <= 0xffff);
1040  bp->b_error = (unsigned short)error;
1041  trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1042 }
1043 
1044 void
1045 xfs_buf_ioerror_alert(
1046  struct xfs_buf *bp,
1047  const char *func)
1048 {
1049  xfs_alert(bp->b_target->bt_mount,
1050 "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
1051  (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
1052 }
1053 
1054 /*
1055  * Called when we want to stop a buffer from getting written or read.
1056  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1057  * so that the proper iodone callbacks get called.
1058  */
1059 STATIC int
1060 xfs_bioerror(
1061  xfs_buf_t *bp)
1062 {
1063 #ifdef XFSERRORDEBUG
1064  ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1065 #endif
1066 
1067  /*
1068  * No need to wait until the buffer is unpinned, we aren't flushing it.
1069  */
1070  xfs_buf_ioerror(bp, EIO);
1071 
1072  /*
1073  * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1074  */
1075  XFS_BUF_UNREAD(bp);
1076  XFS_BUF_UNDONE(bp);
1077  xfs_buf_stale(bp);
1078 
1079  xfs_buf_ioend(bp, 0);
1080 
1081  return EIO;
1082 }
1083 
1084 /*
1085  * Same as xfs_bioerror, except that we are releasing the buffer
1086  * here ourselves, and avoiding the xfs_buf_ioend call.
1087  * This is meant for userdata errors; metadata bufs come with
1088  * iodone functions attached, so that we can track down errors.
1089  */
1090 STATIC int
1091 xfs_bioerror_relse(
1092  struct xfs_buf *bp)
1093 {
1094  int64_t fl = bp->b_flags;
1095  /*
1096  * No need to wait until the buffer is unpinned.
1097  * We aren't flushing it.
1098  *
1099  * chunkhold expects B_DONE to be set, whether
1100  * we actually finish the I/O or not. We don't want to
1101  * change that interface.
1102  */
1103  XFS_BUF_UNREAD(bp);
1104  XFS_BUF_DONE(bp);
1105  xfs_buf_stale(bp);
1106  bp->b_iodone = NULL;
1107  if (!(fl & XBF_ASYNC)) {
1108  /*
1109  * Mark b_error and B_ERROR _both_.
1110  * Lots of chunkcache code assumes that.
1111  * There's no reason to mark error for
1112  * ASYNC buffers.
1113  */
1114  xfs_buf_ioerror(bp, EIO);
1115  complete(&bp->b_iowait);
1116  } else {
1117  xfs_buf_relse(bp);
1118  }
1119 
1120  return EIO;
1121 }
1122 
1123 STATIC int
1124 xfs_bdstrat_cb(
1125  struct xfs_buf *bp)
1126 {
1127  if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1128  trace_xfs_bdstrat_shut(bp, _RET_IP_);
1129  /*
1130  * Metadata write that didn't get logged but
1131  * written delayed anyway. These aren't associated
1132  * with a transaction, and can be ignored.
1133  */
1134  if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1135  return xfs_bioerror_relse(bp);
1136  else
1137  return xfs_bioerror(bp);
1138  }
1139 
1140  xfs_buf_iorequest(bp);
1141  return 0;
1142 }
1143 
1144 int
1145 xfs_bwrite(
1146  struct xfs_buf *bp)
1147 {
1148  int error;
1149 
1150  ASSERT(xfs_buf_islocked(bp));
1151 
1152  bp->b_flags |= XBF_WRITE;
1153  bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
1154 
1155  xfs_bdstrat_cb(bp);
1156 
1157  error = xfs_buf_iowait(bp);
1158  if (error) {
1159  xfs_force_shutdown(bp->b_target->bt_mount,
1160  SHUTDOWN_META_IO_ERROR);
1161  }
1162  return error;
1163 }
1164 
1165 /*
1166  * Wrapper around bdstrat so that we can stop data from going to disk in case
1167  * we are shutting down the filesystem. Typically user data goes thru this
1168  * path; one of the exceptions is the superblock.
1169  */
1170 void
1171 xfsbdstrat(
1172  struct xfs_mount *mp,
1173  struct xfs_buf *bp)
1174 {
1175  if (XFS_FORCED_SHUTDOWN(mp)) {
1176  trace_xfs_bdstrat_shut(bp, _RET_IP_);
1177  xfs_bioerror_relse(bp);
1178  return;
1179  }
1180 
1181  xfs_buf_iorequest(bp);
1182 }
1183 
1184 STATIC void
1185 _xfs_buf_ioend(
1186  xfs_buf_t *bp,
1187  int schedule)
1188 {
1189  if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1190  xfs_buf_ioend(bp, schedule);
1191 }
1192 
1193 STATIC void
1194 xfs_buf_bio_end_io(
1195  struct bio *bio,
1196  int error)
1197 {
1198  xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1199 
1200  /*
1201  * don't overwrite existing errors - otherwise we can lose errors on
1202  * buffers that require multiple bios to complete.
1203  */
1204  if (!bp->b_error)
1205  xfs_buf_ioerror(bp, -error);
1206 
1207  if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1208  invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1209 
1210  _xfs_buf_ioend(bp, 1);
1211  bio_put(bio);
1212 }
1213 
1214 static void
1215 xfs_buf_ioapply_map(
1216  struct xfs_buf *bp,
1217  int map,
1218  int *buf_offset,
1219  int *count,
1220  int rw)
1221 {
1222  int page_index;
1223  int total_nr_pages = bp->b_page_count;
1224  int nr_pages;
1225  struct bio *bio;
1226  sector_t sector = bp->b_maps[map].bm_bn;
1227  int size;
1228  int offset;
1229 
1230  total_nr_pages = bp->b_page_count;
1231 
1232  /* skip the pages in the buffer before the start offset */
1233  page_index = 0;
1234  offset = *buf_offset;
1235  while (offset >= PAGE_SIZE) {
1236  page_index++;
1237  offset -= PAGE_SIZE;
1238  }
1239 
1240  /*
1241  * Limit the IO size to the length of the current vector, and update the
1242  * remaining IO count for the next time around.
1243  */
1244  size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
1245  *count -= size;
1246  *buf_offset += size;
1247 
1248 next_chunk:
1249  atomic_inc(&bp->b_io_remaining);
1250  nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1251  if (nr_pages > total_nr_pages)
1252  nr_pages = total_nr_pages;
1253 
1254  bio = bio_alloc(GFP_NOIO, nr_pages);
1255  bio->bi_bdev = bp->b_target->bt_bdev;
1256  bio->bi_sector = sector;
1257  bio->bi_end_io = xfs_buf_bio_end_io;
1258  bio->bi_private = bp;
1259 
1260 
1261  for (; size && nr_pages; nr_pages--, page_index++) {
1262  int rbytes, nbytes = PAGE_SIZE - offset;
1263 
1264  if (nbytes > size)
1265  nbytes = size;
1266 
1267  rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
1268  offset);
1269  if (rbytes < nbytes)
1270  break;
1271 
1272  offset = 0;
1273  sector += BTOBB(nbytes);
1274  size -= nbytes;
1275  total_nr_pages--;
1276  }
1277 
1278  if (likely(bio->bi_size)) {
1279  if (xfs_buf_is_vmapped(bp)) {
1280  flush_kernel_vmap_range(bp->b_addr,
1281  xfs_buf_vmap_len(bp));
1282  }
1283  submit_bio(rw, bio);
1284  if (size)
1285  goto next_chunk;
1286  } else {
1287  /*
1288  * This is guaranteed not to be the last io reference count
1289  * because the caller (xfs_buf_iorequest) holds a count itself.
1290  */
1291  atomic_dec(&bp->b_io_remaining);
1292  xfs_buf_ioerror(bp, EIO);
1293  bio_put(bio);
1294  }
1295 
1296 }
1297 
1298 STATIC void
1299 _xfs_buf_ioapply(
1300  struct xfs_buf *bp)
1301 {
1302  struct blk_plug plug;
1303  int rw;
1304  int offset;
1305  int size;
1306  int i;
1307 
1308  if (bp->b_flags & XBF_WRITE) {
1309  if (bp->b_flags & XBF_SYNCIO)
1310  rw = WRITE_SYNC;
1311  else
1312  rw = WRITE;
1313  if (bp->b_flags & XBF_FUA)
1314  rw |= REQ_FUA;
1315  if (bp->b_flags & XBF_FLUSH)
1316  rw |= REQ_FLUSH;
1317  } else if (bp->b_flags & XBF_READ_AHEAD) {
1318  rw = READA;
1319  } else {
1320  rw = READ;
1321  }
1322 
1323  /* we only use the buffer cache for meta-data */
1324  rw |= REQ_META;
1325 
1326  /*
1327  * Walk all the vectors issuing IO on them. Set up the initial offset
1328  * into the buffer and the desired IO size before we start -
1329  * _xfs_buf_ioapply_vec() will modify them appropriately for each
1330  * subsequent call.
1331  */
1332  offset = bp->b_offset;
1333  size = BBTOB(bp->b_io_length);
1334  blk_start_plug(&plug);
1335  for (i = 0; i < bp->b_map_count; i++) {
1336  xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
1337  if (bp->b_error)
1338  break;
1339  if (size <= 0)
1340  break; /* all done */
1341  }
1342  blk_finish_plug(&plug);
1343 }
1344 
1345 void
1346 xfs_buf_iorequest(
1347  xfs_buf_t *bp)
1348 {
1349  trace_xfs_buf_iorequest(bp, _RET_IP_);
1350 
1351  ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1352 
1353  if (bp->b_flags & XBF_WRITE)
1354  xfs_buf_wait_unpin(bp);
1355  xfs_buf_hold(bp);
1356 
1357  /* Set the count to 1 initially, this will stop an I/O
1358  * completion callout which happens before we have started
1359  * all the I/O from calling xfs_buf_ioend too early.
1360  */
1361  atomic_set(&bp->b_io_remaining, 1);
1362  _xfs_buf_ioapply(bp);
1363  _xfs_buf_ioend(bp, 1);
1364 
1365  xfs_buf_rele(bp);
1366 }
1367 
1368 /*
1369  * Waits for I/O to complete on the buffer supplied. It returns immediately if
1370  * no I/O is pending or there is already a pending error on the buffer. It
1371  * returns the I/O error code, if any, or 0 if there was no error.
1372  */
1373 int
1374 xfs_buf_iowait(
1375  xfs_buf_t *bp)
1376 {
1377  trace_xfs_buf_iowait(bp, _RET_IP_);
1378 
1379  if (!bp->b_error)
1380  wait_for_completion(&bp->b_iowait);
1381 
1382  trace_xfs_buf_iowait_done(bp, _RET_IP_);
1383  return bp->b_error;
1384 }
1385 
1386 xfs_caddr_t
1387 xfs_buf_offset(
1388  xfs_buf_t *bp,
1389  size_t offset)
1390 {
1391  struct page *page;
1392 
1393  if (bp->b_addr)
1394  return bp->b_addr + offset;
1395 
1396  offset += bp->b_offset;
1397  page = bp->b_pages[offset >> PAGE_SHIFT];
1398  return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1399 }
1400 
1401 /*
1402  * Move data into or out of a buffer.
1403  */
1404 void
1405 xfs_buf_iomove(
1406  xfs_buf_t *bp, /* buffer to process */
1407  size_t boff, /* starting buffer offset */
1408  size_t bsize, /* length to copy */
1409  void *data, /* data address */
1410  xfs_buf_rw_t mode) /* read/write/zero flag */
1411 {
1412  size_t bend;
1413 
1414  bend = boff + bsize;
1415  while (boff < bend) {
1416  struct page *page;
1417  int page_index, page_offset, csize;
1418 
1419  page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1420  page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1421  page = bp->b_pages[page_index];
1422  csize = min_t(size_t, PAGE_SIZE - page_offset,
1423  BBTOB(bp->b_io_length) - boff);
1424 
1425  ASSERT((csize + page_offset) <= PAGE_SIZE);
1426 
1427  switch (mode) {
1428  case XBRW_ZERO:
1429  memset(page_address(page) + page_offset, 0, csize);
1430  break;
1431  case XBRW_READ:
1432  memcpy(data, page_address(page) + page_offset, csize);
1433  break;
1434  case XBRW_WRITE:
1435  memcpy(page_address(page) + page_offset, data, csize);
1436  }
1437 
1438  boff += csize;
1439  data += csize;
1440  }
1441 }
1442 
1443 /*
1444  * Handling of buffer targets (buftargs).
1445  */
1446 
1447 /*
1448  * Wait for any bufs with callbacks that have been submitted but have not yet
1449  * returned. These buffers will have an elevated hold count, so wait on those
1450  * while freeing all the buffers only held by the LRU.
1451  */
1452 void
1453 xfs_wait_buftarg(
1454  struct xfs_buftarg *btp)
1455 {
1456  struct xfs_buf *bp;
1457 
1458 restart:
1459  spin_lock(&btp->bt_lru_lock);
1460  while (!list_empty(&btp->bt_lru)) {
1461  bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1462  if (atomic_read(&bp->b_hold) > 1) {
1463  spin_unlock(&btp->bt_lru_lock);
1464  delay(100);
1465  goto restart;
1466  }
1467  /*
1468  * clear the LRU reference count so the buffer doesn't get
1469  * ignored in xfs_buf_rele().
1470  */
1471  atomic_set(&bp->b_lru_ref, 0);
1472  spin_unlock(&btp->bt_lru_lock);
1473  xfs_buf_rele(bp);
1474  spin_lock(&btp->bt_lru_lock);
1475  }
1476  spin_unlock(&btp->bt_lru_lock);
1477 }
1478 
1479 int
1480 xfs_buftarg_shrink(
1481  struct shrinker *shrink,
1482  struct shrink_control *sc)
1483 {
1484  struct xfs_buftarg *btp = container_of(shrink,
1485  struct xfs_buftarg, bt_shrinker);
1486  struct xfs_buf *bp;
1487  int nr_to_scan = sc->nr_to_scan;
1488  LIST_HEAD(dispose);
1489 
1490  if (!nr_to_scan)
1491  return btp->bt_lru_nr;
1492 
1493  spin_lock(&btp->bt_lru_lock);
1494  while (!list_empty(&btp->bt_lru)) {
1495  if (nr_to_scan-- <= 0)
1496  break;
1497 
1498  bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1499 
1500  /*
1501  * Decrement the b_lru_ref count unless the value is already
1502  * zero. If the value is already zero, we need to reclaim the
1503  * buffer, otherwise it gets another trip through the LRU.
1504  */
1505  if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1506  list_move_tail(&bp->b_lru, &btp->bt_lru);
1507  continue;
1508  }
1509 
1510  /*
1511  * remove the buffer from the LRU now to avoid needing another
1512  * lock round trip inside xfs_buf_rele().
1513  */
1514  list_move(&bp->b_lru, &dispose);
1515  btp->bt_lru_nr--;
1516  bp->b_lru_flags |= _XBF_LRU_DISPOSE;
1517  }
1518  spin_unlock(&btp->bt_lru_lock);
1519 
1520  while (!list_empty(&dispose)) {
1521  bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1522  list_del_init(&bp->b_lru);
1523  xfs_buf_rele(bp);
1524  }
1525 
1526  return btp->bt_lru_nr;
1527 }
1528 
1529 void
1530 xfs_free_buftarg(
1531  struct xfs_mount *mp,
1532  struct xfs_buftarg *btp)
1533 {
1534  unregister_shrinker(&btp->bt_shrinker);
1535 
1536  if (mp->m_flags & XFS_MOUNT_BARRIER)
1537  xfs_blkdev_issue_flush(btp);
1538 
1539  kmem_free(btp);
1540 }
1541 
1542 STATIC int
1543 xfs_setsize_buftarg_flags(
1544  xfs_buftarg_t *btp,
1545  unsigned int blocksize,
1546  unsigned int sectorsize,
1547  int verbose)
1548 {
1549  btp->bt_bsize = blocksize;
1550  btp->bt_sshift = ffs(sectorsize) - 1;
1551  btp->bt_smask = sectorsize - 1;
1552 
1553  if (set_blocksize(btp->bt_bdev, sectorsize)) {
1554  char name[BDEVNAME_SIZE];
1555 
1556  bdevname(btp->bt_bdev, name);
1557 
1558  xfs_warn(btp->bt_mount,
1559  "Cannot set_blocksize to %u on device %s\n",
1560  sectorsize, name);
1561  return EINVAL;
1562  }
1563 
1564  return 0;
1565 }
1566 
1567 /*
1568  * When allocating the initial buffer target we have not yet
1569  * read in the superblock, so don't know what sized sectors
1570  * are being used at this early stage. Play safe.
1571  */
1572 STATIC int
1573 xfs_setsize_buftarg_early(
1574  xfs_buftarg_t *btp,
1575  struct block_device *bdev)
1576 {
1577  return xfs_setsize_buftarg_flags(btp,
1578  PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1579 }
1580 
1581 int
1582 xfs_setsize_buftarg(
1583  xfs_buftarg_t *btp,
1584  unsigned int blocksize,
1585  unsigned int sectorsize)
1586 {
1587  return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1588 }
1589 
1590 xfs_buftarg_t *
1591 xfs_alloc_buftarg(
1592  struct xfs_mount *mp,
1593  struct block_device *bdev,
1594  int external,
1595  const char *fsname)
1596 {
1597  xfs_buftarg_t *btp;
1598 
1599  btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1600 
1601  btp->bt_mount = mp;
1602  btp->bt_dev = bdev->bd_dev;
1603  btp->bt_bdev = bdev;
1604  btp->bt_bdi = blk_get_backing_dev_info(bdev);
1605  if (!btp->bt_bdi)
1606  goto error;
1607 
1608  INIT_LIST_HEAD(&btp->bt_lru);
1609  spin_lock_init(&btp->bt_lru_lock);
1610  if (xfs_setsize_buftarg_early(btp, bdev))
1611  goto error;
1612  btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1613  btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1614  register_shrinker(&btp->bt_shrinker);
1615  return btp;
1616 
1617 error:
1618  kmem_free(btp);
1619  return NULL;
1620 }
1621 
1622 /*
1623  * Add a buffer to the delayed write list.
1624  *
1625  * This queues a buffer for writeout if it hasn't already been. Note that
1626  * neither this routine nor the buffer list submission functions perform
1627  * any internal synchronization. It is expected that the lists are thread-local
1628  * to the callers.
1629  *
1630  * Returns true if we queued up the buffer, or false if it already had
1631  * been on the buffer list.
1632  */
1633 bool
1634 xfs_buf_delwri_queue(
1635  struct xfs_buf *bp,
1636  struct list_head *list)
1637 {
1638  ASSERT(xfs_buf_islocked(bp));
1639  ASSERT(!(bp->b_flags & XBF_READ));
1640 
1641  /*
1642  * If the buffer is already marked delwri it is already queued up
1643  * by someone else for immediate writeout. Just ignore it in that
1644  * case.
1645  */
1646  if (bp->b_flags & _XBF_DELWRI_Q) {
1647  trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1648  return false;
1649  }
1650 
1651  trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1652 
1653  /*
1654  * If a buffer gets written out synchronously or marked stale while it
1655  * is on a delwri list we lazily remove it. To do this, the other party
1656  * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1657  * It remains referenced and on the list. In a rare corner case it
1658  * might get readded to a delwri list after the synchronous writeout, in
1659  * which case we just need to re-add the flag here.
1660  */
1661  bp->b_flags |= _XBF_DELWRI_Q;
1662  if (list_empty(&bp->b_list)) {
1663  atomic_inc(&bp->b_hold);
1664  list_add_tail(&bp->b_list, list);
1665  }
1666 
1667  return true;
1668 }
1669 
1670 /*
1671  * Compare function is more complex than it needs to be because
1672  * the return value is only 32 bits and we are doing comparisons
1673  * on 64 bit values
1674  */
1675 static int
1676 xfs_buf_cmp(
1677  void *priv,
1678  struct list_head *a,
1679  struct list_head *b)
1680 {
1681  struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1682  struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1683  xfs_daddr_t diff;
1684 
1685  diff = ap->b_map.bm_bn - bp->b_map.bm_bn;
1686  if (diff < 0)
1687  return -1;
1688  if (diff > 0)
1689  return 1;
1690  return 0;
1691 }
1692 
1693 static int
1694 __xfs_buf_delwri_submit(
1695  struct list_head *buffer_list,
1696  struct list_head *io_list,
1697  bool wait)
1698 {
1699  struct blk_plug plug;
1700  struct xfs_buf *bp, *n;
1701  int pinned = 0;
1702 
1703  list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1704  if (!wait) {
1705  if (xfs_buf_ispinned(bp)) {
1706  pinned++;
1707  continue;
1708  }
1709  if (!xfs_buf_trylock(bp))
1710  continue;
1711  } else {
1712  xfs_buf_lock(bp);
1713  }
1714 
1715  /*
1716  * Someone else might have written the buffer synchronously or
1717  * marked it stale in the meantime. In that case only the
1718  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
1719  * reference and remove it from the list here.
1720  */
1721  if (!(bp->b_flags & _XBF_DELWRI_Q)) {
1722  list_del_init(&bp->b_list);
1723  xfs_buf_relse(bp);
1724  continue;
1725  }
1726 
1727  list_move_tail(&bp->b_list, io_list);
1728  trace_xfs_buf_delwri_split(bp, _RET_IP_);
1729  }
1730 
1731  list_sort(NULL, io_list, xfs_buf_cmp);
1732 
1733  blk_start_plug(&plug);
1734  list_for_each_entry_safe(bp, n, io_list, b_list) {
1735  bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
1736  bp->b_flags |= XBF_WRITE;
1737 
1738  if (!wait) {
1739  bp->b_flags |= XBF_ASYNC;
1740  list_del_init(&bp->b_list);
1741  }
1742  xfs_bdstrat_cb(bp);
1743  }
1744  blk_finish_plug(&plug);
1745 
1746  return pinned;
1747 }
1748 
1749 /*
1750  * Write out a buffer list asynchronously.
1751  *
1752  * This will take the @buffer_list, write all non-locked and non-pinned buffers
1753  * out and not wait for I/O completion on any of the buffers. This interface
1754  * is only safely usable for callers that can track I/O completion by higher
1755  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
1756  * function.
1757  */
1758 int
1759 xfs_buf_delwri_submit_nowait(
1760  struct list_head *buffer_list)
1761 {
1762  LIST_HEAD (io_list);
1763  return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1764 }
1765 
1766 /*
1767  * Write out a buffer list synchronously.
1768  *
1769  * This will take the @buffer_list, write all buffers out and wait for I/O
1770  * completion on all of the buffers. @buffer_list is consumed by the function,
1771  * so callers must have some other way of tracking buffers if they require such
1772  * functionality.
1773  */
1774 int
1775 xfs_buf_delwri_submit(
1776  struct list_head *buffer_list)
1777 {
1778  LIST_HEAD (io_list);
1779  int error = 0, error2;
1780  struct xfs_buf *bp;
1781 
1782  __xfs_buf_delwri_submit(buffer_list, &io_list, true);
1783 
1784  /* Wait for IO to complete. */
1785  while (!list_empty(&io_list)) {
1786  bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1787 
1788  list_del_init(&bp->b_list);
1789  error2 = xfs_buf_iowait(bp);
1790  xfs_buf_relse(bp);
1791  if (!error)
1792  error = error2;
1793  }
1794 
1795  return error;
1796 }
1797 
1798 int __init
1799 xfs_buf_init(void)
1800 {
1801  xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1802  KM_ZONE_HWALIGN, NULL);
1803  if (!xfs_buf_zone)
1804  goto out;
1805 
1806  xfslogd_workqueue = alloc_workqueue("xfslogd",
1807  WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1808  if (!xfslogd_workqueue)
1809  goto out_free_buf_zone;
1810 
1811  return 0;
1812 
1813  out_free_buf_zone:
1814  kmem_zone_destroy(xfs_buf_zone);
1815  out:
1816  return -ENOMEM;
1817 }
1818 
1819 void
1820 xfs_buf_terminate(void)
1821 {
1822  destroy_workqueue(xfslogd_workqueue);
1823  kmem_zone_destroy(xfs_buf_zone);
1824 }