Linux Kernel  3.7.1
xfs_mount.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_trans.h"
25 #include "xfs_trans_priv.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_mount.h"
30 #include "xfs_bmap_btree.h"
31 #include "xfs_alloc_btree.h"
32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dinode.h"
34 #include "xfs_inode.h"
35 #include "xfs_btree.h"
36 #include "xfs_ialloc.h"
37 #include "xfs_alloc.h"
38 #include "xfs_rtalloc.h"
39 #include "xfs_bmap.h"
40 #include "xfs_error.h"
41 #include "xfs_quota.h"
42 #include "xfs_fsops.h"
43 #include "xfs_utils.h"
44 #include "xfs_trace.h"
45 
46 
47 #ifdef HAVE_PERCPU_SB
48 STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
49  int);
50 STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
51  int);
52 STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
53 #else
54 
55 #define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
56 #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
57 #endif
58 
59 static const struct {
60  short offset;
61  short type; /* 0 = integer
62  * 1 = binary / string (no translation)
63  */
64 } xfs_sb_info[] = {
65  { offsetof(xfs_sb_t, sb_magicnum), 0 },
66  { offsetof(xfs_sb_t, sb_blocksize), 0 },
67  { offsetof(xfs_sb_t, sb_dblocks), 0 },
68  { offsetof(xfs_sb_t, sb_rblocks), 0 },
69  { offsetof(xfs_sb_t, sb_rextents), 0 },
70  { offsetof(xfs_sb_t, sb_uuid), 1 },
71  { offsetof(xfs_sb_t, sb_logstart), 0 },
72  { offsetof(xfs_sb_t, sb_rootino), 0 },
73  { offsetof(xfs_sb_t, sb_rbmino), 0 },
74  { offsetof(xfs_sb_t, sb_rsumino), 0 },
75  { offsetof(xfs_sb_t, sb_rextsize), 0 },
76  { offsetof(xfs_sb_t, sb_agblocks), 0 },
77  { offsetof(xfs_sb_t, sb_agcount), 0 },
78  { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
79  { offsetof(xfs_sb_t, sb_logblocks), 0 },
80  { offsetof(xfs_sb_t, sb_versionnum), 0 },
81  { offsetof(xfs_sb_t, sb_sectsize), 0 },
82  { offsetof(xfs_sb_t, sb_inodesize), 0 },
83  { offsetof(xfs_sb_t, sb_inopblock), 0 },
84  { offsetof(xfs_sb_t, sb_fname[0]), 1 },
85  { offsetof(xfs_sb_t, sb_blocklog), 0 },
86  { offsetof(xfs_sb_t, sb_sectlog), 0 },
87  { offsetof(xfs_sb_t, sb_inodelog), 0 },
88  { offsetof(xfs_sb_t, sb_inopblog), 0 },
89  { offsetof(xfs_sb_t, sb_agblklog), 0 },
90  { offsetof(xfs_sb_t, sb_rextslog), 0 },
91  { offsetof(xfs_sb_t, sb_inprogress), 0 },
92  { offsetof(xfs_sb_t, sb_imax_pct), 0 },
93  { offsetof(xfs_sb_t, sb_icount), 0 },
94  { offsetof(xfs_sb_t, sb_ifree), 0 },
95  { offsetof(xfs_sb_t, sb_fdblocks), 0 },
96  { offsetof(xfs_sb_t, sb_frextents), 0 },
97  { offsetof(xfs_sb_t, sb_uquotino), 0 },
98  { offsetof(xfs_sb_t, sb_gquotino), 0 },
99  { offsetof(xfs_sb_t, sb_qflags), 0 },
100  { offsetof(xfs_sb_t, sb_flags), 0 },
101  { offsetof(xfs_sb_t, sb_shared_vn), 0 },
102  { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
103  { offsetof(xfs_sb_t, sb_unit), 0 },
104  { offsetof(xfs_sb_t, sb_width), 0 },
105  { offsetof(xfs_sb_t, sb_dirblklog), 0 },
106  { offsetof(xfs_sb_t, sb_logsectlog), 0 },
107  { offsetof(xfs_sb_t, sb_logsectsize),0 },
108  { offsetof(xfs_sb_t, sb_logsunit), 0 },
109  { offsetof(xfs_sb_t, sb_features2), 0 },
110  { offsetof(xfs_sb_t, sb_bad_features2), 0 },
111  { sizeof(xfs_sb_t), 0 }
112 };
113 
114 static DEFINE_MUTEX(xfs_uuid_table_mutex);
115 static int xfs_uuid_table_size;
116 static uuid_t *xfs_uuid_table;
117 
118 /*
119  * See if the UUID is unique among mounted XFS filesystems.
120  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
121  */
122 STATIC int
123 xfs_uuid_mount(
124  struct xfs_mount *mp)
125 {
126  uuid_t *uuid = &mp->m_sb.sb_uuid;
127  int hole, i;
128 
129  if (mp->m_flags & XFS_MOUNT_NOUUID)
130  return 0;
131 
132  if (uuid_is_nil(uuid)) {
133  xfs_warn(mp, "Filesystem has nil UUID - can't mount");
134  return XFS_ERROR(EINVAL);
135  }
136 
137  mutex_lock(&xfs_uuid_table_mutex);
138  for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
139  if (uuid_is_nil(&xfs_uuid_table[i])) {
140  hole = i;
141  continue;
142  }
143  if (uuid_equal(uuid, &xfs_uuid_table[i]))
144  goto out_duplicate;
145  }
146 
147  if (hole < 0) {
148  xfs_uuid_table = kmem_realloc(xfs_uuid_table,
149  (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
150  xfs_uuid_table_size * sizeof(*xfs_uuid_table),
151  KM_SLEEP);
152  hole = xfs_uuid_table_size++;
153  }
154  xfs_uuid_table[hole] = *uuid;
155  mutex_unlock(&xfs_uuid_table_mutex);
156 
157  return 0;
158 
159  out_duplicate:
160  mutex_unlock(&xfs_uuid_table_mutex);
161  xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
162  return XFS_ERROR(EINVAL);
163 }
164 
165 STATIC void
166 xfs_uuid_unmount(
167  struct xfs_mount *mp)
168 {
169  uuid_t *uuid = &mp->m_sb.sb_uuid;
170  int i;
171 
172  if (mp->m_flags & XFS_MOUNT_NOUUID)
173  return;
174 
175  mutex_lock(&xfs_uuid_table_mutex);
176  for (i = 0; i < xfs_uuid_table_size; i++) {
177  if (uuid_is_nil(&xfs_uuid_table[i]))
178  continue;
179  if (!uuid_equal(uuid, &xfs_uuid_table[i]))
180  continue;
181  memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
182  break;
183  }
184  ASSERT(i < xfs_uuid_table_size);
185  mutex_unlock(&xfs_uuid_table_mutex);
186 }
187 
188 
189 /*
190  * Reference counting access wrappers to the perag structures.
191  * Because we never free per-ag structures, the only thing we
192  * have to protect against changes is the tree structure itself.
193  */
194 struct xfs_perag *
195 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
196 {
197  struct xfs_perag *pag;
198  int ref = 0;
199 
200  rcu_read_lock();
201  pag = radix_tree_lookup(&mp->m_perag_tree, agno);
202  if (pag) {
203  ASSERT(atomic_read(&pag->pag_ref) >= 0);
204  ref = atomic_inc_return(&pag->pag_ref);
205  }
206  rcu_read_unlock();
207  trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
208  return pag;
209 }
210 
211 /*
212  * search from @first to find the next perag with the given tag set.
213  */
214 struct xfs_perag *
215 xfs_perag_get_tag(
216  struct xfs_mount *mp,
217  xfs_agnumber_t first,
218  int tag)
219 {
220  struct xfs_perag *pag;
221  int found;
222  int ref;
223 
224  rcu_read_lock();
225  found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
226  (void **)&pag, first, 1, tag);
227  if (found <= 0) {
228  rcu_read_unlock();
229  return NULL;
230  }
231  ref = atomic_inc_return(&pag->pag_ref);
232  rcu_read_unlock();
233  trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
234  return pag;
235 }
236 
237 void
238 xfs_perag_put(struct xfs_perag *pag)
239 {
240  int ref;
241 
242  ASSERT(atomic_read(&pag->pag_ref) > 0);
243  ref = atomic_dec_return(&pag->pag_ref);
244  trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
245 }
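/*
 * A minimal usage sketch for the wrappers above: lookups pair
 * xfs_perag_get() with xfs_perag_put() so pag_ref stays balanced.
 * use_ag_metadata() is a hypothetical helper, not a real XFS function:
 *
 *	struct xfs_perag *pag = xfs_perag_get(mp, agno);
 *	if (pag) {
 *		use_ag_metadata(pag);
 *		xfs_perag_put(pag);
 *	}
 */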
246 
247 STATIC void
248 __xfs_free_perag(
249  struct rcu_head *head)
250 {
251  struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
252 
253  ASSERT(atomic_read(&pag->pag_ref) == 0);
254  kmem_free(pag);
255 }
256 
257 /*
258  * Free up the per-ag resources associated with the mount structure.
259  */
260 STATIC void
261 xfs_free_perag(
262  xfs_mount_t *mp)
263 {
264  xfs_agnumber_t agno;
265  struct xfs_perag *pag;
266 
267  for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
268  spin_lock(&mp->m_perag_lock);
269  pag = radix_tree_delete(&mp->m_perag_tree, agno);
270  spin_unlock(&mp->m_perag_lock);
271  ASSERT(pag);
272  ASSERT(atomic_read(&pag->pag_ref) == 0);
273  call_rcu(&pag->rcu_head, __xfs_free_perag);
274  }
275 }
276 
277 /*
278  * Check size of device based on the (data/realtime) block count.
279  * Note: this check is used by the growfs code as well as mount.
280  */
281 int
282 xfs_sb_validate_fsb_count(
283  xfs_sb_t *sbp,
284  __uint64_t nblocks)
285 {
286  ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
287  ASSERT(sbp->sb_blocklog >= BBSHIFT);
288 
289 #if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
290  if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
291  return EFBIG;
292 #else /* Limited by UINT_MAX of sectors */
293  if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
294  return EFBIG;
295 #endif
296  return 0;
297 }
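/*
 * A worked example of the check above, assuming 4096-byte blocks
 * (sb_blocklog = 12) on a kernel built without XFS_BIG_BLKNOS:
 * nblocks << (12 - BBSHIFT) converts filesystem blocks into 512-byte
 * basic blocks (BBSHIFT is 9), so any count above UINT_MAX >> 3
 * (roughly 2^29 blocks, i.e. about 2 TiB of data) fails the test and
 * mount or growfs returns EFBIG.
 */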
298 
299 /*
300  * Check the validity of the SB found.
301  */
302 STATIC int
303 xfs_mount_validate_sb(
304  xfs_mount_t *mp,
305  xfs_sb_t *sbp,
306  int flags)
307 {
308  int loud = !(flags & XFS_MFSI_QUIET);
309 
310  /*
311  * If the log device and data device have the
312  * same device number, the log is internal.
313  * Consequently, the sb_logstart should be non-zero. If
314  * we have a zero sb_logstart in this case, we may be trying to mount
315  * a volume filesystem in a non-volume manner.
316  */
317  if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318  if (loud)
319  xfs_warn(mp, "bad magic number");
320  return XFS_ERROR(EWRONGFS);
321  }
322 
323  if (!xfs_sb_good_version(sbp)) {
324  if (loud)
325  xfs_warn(mp, "bad version");
326  return XFS_ERROR(EWRONGFS);
327  }
328 
329  if (unlikely(
330  sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331  if (loud)
332  xfs_warn(mp,
333  "filesystem is marked as having an external log; "
334  "specify logdev on the mount command line.");
335  return XFS_ERROR(EINVAL);
336  }
337 
338  if (unlikely(
339  sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340  if (loud)
341  xfs_warn(mp,
342  "filesystem is marked as having an internal log; "
343  "do not specify logdev on the mount command line.");
344  return XFS_ERROR(EINVAL);
345  }
346 
347  /*
348  * More sanity checking. Most of these were stolen directly from
349  * xfs_repair.
350  */
351  if (unlikely(
352  sbp->sb_agcount <= 0 ||
353  sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
354  sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
355  sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
356  sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
357  sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
358  sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
359  sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
360  sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
361  sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
362  sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
363  sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
364  sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
365  sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
366  sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
367  sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
368  (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
369  (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370  (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371  (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
372  sbp->sb_dblocks == 0 ||
373  sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374  sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375  if (loud)
376  XFS_CORRUPTION_ERROR("SB sanity check failed",
377  XFS_ERRLEVEL_LOW, mp, sbp);
378  return XFS_ERROR(EFSCORRUPTED);
379  }
380 
381  /*
382  * Until this is fixed only page-sized or smaller data blocks work.
383  */
384  if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385  if (loud) {
386  xfs_warn(mp,
387  "File system with blocksize %d bytes. "
388  "Only pagesize (%ld) or less will currently work.",
389  sbp->sb_blocksize, PAGE_SIZE);
390  }
391  return XFS_ERROR(ENOSYS);
392  }
393 
394  /*
395  * Currently only very few inode sizes are supported.
396  */
397  switch (sbp->sb_inodesize) {
398  case 256:
399  case 512:
400  case 1024:
401  case 2048:
402  break;
403  default:
404  if (loud)
405  xfs_warn(mp, "inode size of %d bytes not supported",
406  sbp->sb_inodesize);
407  return XFS_ERROR(ENOSYS);
408  }
409 
410  if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411  xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412  if (loud)
413  xfs_warn(mp,
414  "file system too large to be mounted on this system.");
415  return XFS_ERROR(EFBIG);
416  }
417 
418  if (unlikely(sbp->sb_inprogress)) {
419  if (loud)
420  xfs_warn(mp, "file system busy");
421  return XFS_ERROR(EFSCORRUPTED);
422  }
423 
424  /*
425  * Version 1 directory format has never worked on Linux.
426  */
427  if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428  if (loud)
429  xfs_warn(mp,
430  "file system using version 1 directory format");
431  return XFS_ERROR(ENOSYS);
432  }
433 
434  return 0;
435 }
436 
437 int
438 xfs_initialize_perag(
439  xfs_mount_t *mp,
440  xfs_agnumber_t agcount,
441  xfs_agnumber_t *maxagi)
442 {
443  xfs_agnumber_t index;
444  xfs_agnumber_t first_initialised = 0;
445  xfs_perag_t *pag;
446  xfs_agino_t agino;
447  xfs_ino_t ino;
448  xfs_sb_t *sbp = &mp->m_sb;
449  int error = -ENOMEM;
450 
451  /*
452  * Walk the current per-ag tree so we don't try to initialise AGs
453  * that already exist (growfs case). Allocate and insert all the
454  * AGs we don't find ready for initialisation.
455  */
456  for (index = 0; index < agcount; index++) {
457  pag = xfs_perag_get(mp, index);
458  if (pag) {
459  xfs_perag_put(pag);
460  continue;
461  }
462  if (!first_initialised)
463  first_initialised = index;
464 
465  pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
466  if (!pag)
467  goto out_unwind;
468  pag->pag_agno = index;
469  pag->pag_mount = mp;
470  spin_lock_init(&pag->pag_ici_lock);
471  mutex_init(&pag->pag_ici_reclaim_lock);
472  INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
473  spin_lock_init(&pag->pag_buf_lock);
474  pag->pag_buf_tree = RB_ROOT;
475 
476  if (radix_tree_preload(GFP_NOFS))
477  goto out_unwind;
478 
479  spin_lock(&mp->m_perag_lock);
480  if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
481  BUG();
482  spin_unlock(&mp->m_perag_lock);
483  radix_tree_preload_end();
484  error = -EEXIST;
485  goto out_unwind;
486  }
487  spin_unlock(&mp->m_perag_lock);
488  radix_tree_preload_end();
489  }
490 
491  /*
492  * If we mount with the inode64 option, or no inode overflows
493  * the legacy 32-bit address space, clear the inode32 option.
494  */
495  agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
496  ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
497 
498  if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
499  mp->m_flags |= XFS_MOUNT_32BITINODES;
500  else
501  mp->m_flags &= ~XFS_MOUNT_32BITINODES;
502 
503  if (mp->m_flags & XFS_MOUNT_32BITINODES)
504  index = xfs_set_inode32(mp);
505  else
506  index = xfs_set_inode64(mp);
507 
508  if (maxagi)
509  *maxagi = index;
510  return 0;
511 
512 out_unwind:
513  kmem_free(pag);
514  for (; index > first_initialised; index--) {
515  pag = radix_tree_delete(&mp->m_perag_tree, index);
516  kmem_free(pag);
517  }
518  return error;
519 }
520 
521 void
522 xfs_sb_from_disk(
523  struct xfs_mount *mp,
524  xfs_dsb_t *from)
525 {
526  struct xfs_sb *to = &mp->m_sb;
527 
528  to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529  to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530  to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
531  to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
532  to->sb_rextents = be64_to_cpu(from->sb_rextents);
533  memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
534  to->sb_logstart = be64_to_cpu(from->sb_logstart);
535  to->sb_rootino = be64_to_cpu(from->sb_rootino);
536  to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
537  to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
538  to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
539  to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
540  to->sb_agcount = be32_to_cpu(from->sb_agcount);
541  to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
542  to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
543  to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
544  to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
545  to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
546  to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
547  memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
548  to->sb_blocklog = from->sb_blocklog;
549  to->sb_sectlog = from->sb_sectlog;
550  to->sb_inodelog = from->sb_inodelog;
551  to->sb_inopblog = from->sb_inopblog;
552  to->sb_agblklog = from->sb_agblklog;
553  to->sb_rextslog = from->sb_rextslog;
554  to->sb_inprogress = from->sb_inprogress;
555  to->sb_imax_pct = from->sb_imax_pct;
556  to->sb_icount = be64_to_cpu(from->sb_icount);
557  to->sb_ifree = be64_to_cpu(from->sb_ifree);
558  to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
559  to->sb_frextents = be64_to_cpu(from->sb_frextents);
560  to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
561  to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
562  to->sb_qflags = be16_to_cpu(from->sb_qflags);
563  to->sb_flags = from->sb_flags;
564  to->sb_shared_vn = from->sb_shared_vn;
565  to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
566  to->sb_unit = be32_to_cpu(from->sb_unit);
567  to->sb_width = be32_to_cpu(from->sb_width);
568  to->sb_dirblklog = from->sb_dirblklog;
569  to->sb_logsectlog = from->sb_logsectlog;
570  to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
571  to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
572  to->sb_features2 = be32_to_cpu(from->sb_features2);
573  to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
574 }
575 
576 /*
577  * Copy in core superblock to ondisk one.
578  *
579  * The fields argument is mask of superblock fields to copy.
580  */
581 void
582 xfs_sb_to_disk(
583  xfs_dsb_t *to,
584  xfs_sb_t *from,
585  __int64_t fields)
586 {
587  xfs_caddr_t to_ptr = (xfs_caddr_t)to;
588  xfs_caddr_t from_ptr = (xfs_caddr_t)from;
589  xfs_sb_field_t f;
590  int first;
591  int size;
592 
593  ASSERT(fields);
594  if (!fields)
595  return;
596 
597  while (fields) {
598  f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
599  first = xfs_sb_info[f].offset;
600  size = xfs_sb_info[f + 1].offset - first;
601 
602  ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
603 
604  if (size == 1 || xfs_sb_info[f].type == 1) {
605  memcpy(to_ptr + first, from_ptr + first, size);
606  } else {
607  switch (size) {
608  case 2:
609  *(__be16 *)(to_ptr + first) =
610  cpu_to_be16(*(__u16 *)(from_ptr + first));
611  break;
612  case 4:
613  *(__be32 *)(to_ptr + first) =
614  cpu_to_be32(*(__u32 *)(from_ptr + first));
615  break;
616  case 8:
617  *(__be64 *)(to_ptr + first) =
618  cpu_to_be64(*(__u64 *)(from_ptr + first));
619  break;
620  default:
621  ASSERT(0);
622  }
623  }
624 
625  fields &= ~(1LL << f);
626  }
627 }
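/*
 * A small usage sketch for the field-mask interface above: each set bit
 * in "fields" names one superblock field, and xfs_sb_info[] maps that
 * bit to the field's byte offset and size. Copying only the summary
 * counters into an on-disk buffer could, for example, be written as
 *
 *	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb,
 *		       XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS);
 *
 * which byte-swaps just those three 64-bit counters and leaves every
 * other on-disk field untouched.
 */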
628 
629 /*
630  * xfs_readsb
631  *
632  * Does the initial read of the superblock.
633  */
634 int
635 xfs_readsb(xfs_mount_t *mp, int flags)
636 {
637  unsigned int sector_size;
638  xfs_buf_t *bp;
639  int error;
640  int loud = !(flags & XFS_MFSI_QUIET);
641 
642  ASSERT(mp->m_sb_bp == NULL);
643  ASSERT(mp->m_ddev_targp != NULL);
644 
645  /*
646  * Allocate a (locked) buffer to hold the superblock.
647  * This will be kept around at all times to optimize
648  * access to the superblock.
649  */
650  sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
651 
652 reread:
653  bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654  BTOBB(sector_size), 0);
655  if (!bp) {
656  if (loud)
657  xfs_warn(mp, "SB buffer read failed");
658  return EIO;
659  }
660 
661  /*
662  * Initialize the mount structure from the superblock.
663  * But first do some basic consistency checking.
664  */
665  xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
666  error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
667  if (error) {
668  if (loud)
669  xfs_warn(mp, "SB validate failed");
670  goto release_buf;
671  }
672 
673  /*
674  * We must be able to do sector-sized and sector-aligned IO.
675  */
676  if (sector_size > mp->m_sb.sb_sectsize) {
677  if (loud)
678  xfs_warn(mp, "device supports %u byte sectors (not %u)",
679  sector_size, mp->m_sb.sb_sectsize);
680  error = ENOSYS;
681  goto release_buf;
682  }
683 
684  /*
685  * If device sector size is smaller than the superblock size,
686  * re-read the superblock so the buffer is correctly sized.
687  */
688  if (sector_size < mp->m_sb.sb_sectsize) {
689  xfs_buf_relse(bp);
690  sector_size = mp->m_sb.sb_sectsize;
691  goto reread;
692  }
693 
694  /* Initialize per-cpu counters */
695  xfs_icsb_reinit_counters(mp);
696 
697  mp->m_sb_bp = bp;
698  xfs_buf_unlock(bp);
699  return 0;
700 
701 release_buf:
702  xfs_buf_relse(bp);
703  return error;
704 }
705 
706 
707 /*
708  * xfs_mount_common
709  *
710  * Mount initialization code establishing various mount
711  * fields from the superblock associated with the given
712  * mount structure
713  */
714 STATIC void
715 xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
716 {
717  mp->m_agfrotor = mp->m_agirotor = 0;
718  spin_lock_init(&mp->m_agirotor_lock);
719  mp->m_maxagi = mp->m_sb.sb_agcount;
720  mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
721  mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
722  mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
723  mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
724  mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
725  mp->m_blockmask = sbp->sb_blocksize - 1;
726  mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
727  mp->m_blockwmask = mp->m_blockwsize - 1;
728 
729  mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
730  mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
731  mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
732  mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
733 
734  mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
735  mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
736  mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
737  mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
738 
739  mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
740  mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
741  mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
742  mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
743 
744  mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
745  mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
746  sbp->sb_inopblock);
747  mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
748 }
749 
750 /*
751  * xfs_initialize_perag_data
752  *
753  * Read in each per-ag structure so we can count up the number of
754  * allocated inodes, free inodes and used filesystem blocks as this
755  * information is no longer persistent in the superblock. Once we have
756  * this information, write it into the in-core superblock structure.
757  */
758 STATIC int
759 xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
760 {
761  xfs_agnumber_t index;
762  xfs_perag_t *pag;
763  xfs_sb_t *sbp = &mp->m_sb;
764  uint64_t ifree = 0;
765  uint64_t ialloc = 0;
766  uint64_t bfree = 0;
767  uint64_t bfreelst = 0;
768  uint64_t btree = 0;
769  int error;
770 
771  for (index = 0; index < agcount; index++) {
772  /*
773  * read the agf, then the agi. This gets us
774  * all the information we need and populates the
775  * per-ag structures for us.
776  */
777  error = xfs_alloc_pagf_init(mp, NULL, index, 0);
778  if (error)
779  return error;
780 
781  error = xfs_ialloc_pagi_init(mp, NULL, index);
782  if (error)
783  return error;
784  pag = xfs_perag_get(mp, index);
785  ifree += pag->pagi_freecount;
786  ialloc += pag->pagi_count;
787  bfree += pag->pagf_freeblks;
788  bfreelst += pag->pagf_flcount;
789  btree += pag->pagf_btreeblks;
790  xfs_perag_put(pag);
791  }
792  /*
793  * Overwrite incore superblock counters with just-read data
794  */
795  spin_lock(&mp->m_sb_lock);
796  sbp->sb_ifree = ifree;
797  sbp->sb_icount = ialloc;
798  sbp->sb_fdblocks = bfree + bfreelst + btree;
799  spin_unlock(&mp->m_sb_lock);
800 
801  /* Fixup the per-cpu counters as well. */
802  xfs_icsb_reinit_counters(mp);
803 
804  return 0;
805 }
806 
807 /*
808  * Update alignment values based on mount options and sb values
809  */
810 STATIC int
811 xfs_update_alignment(xfs_mount_t *mp)
812 {
813  xfs_sb_t *sbp = &(mp->m_sb);
814 
815  if (mp->m_dalign) {
816  /*
817  * If stripe unit and stripe width are not multiples
818  * of the fs blocksize turn off alignment.
819  */
820  if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
821  (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
822  if (mp->m_flags & XFS_MOUNT_RETERR) {
823  xfs_warn(mp, "alignment check failed: "
824  "(sunit/swidth vs. blocksize)");
825  return XFS_ERROR(EINVAL);
826  }
827  mp->m_dalign = mp->m_swidth = 0;
828  } else {
829  /*
830  * Convert the stripe unit and width to FSBs.
831  */
832  mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
833  if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
834  if (mp->m_flags & XFS_MOUNT_RETERR) {
835  xfs_warn(mp, "alignment check failed: "
836  "(sunit/swidth vs. ag size)");
837  return XFS_ERROR(EINVAL);
838  }
839  xfs_warn(mp,
840  "stripe alignment turned off: sunit(%d)/swidth(%d) "
841  "incompatible with agsize(%d)",
842  mp->m_dalign, mp->m_swidth,
843  sbp->sb_agblocks);
844 
845  mp->m_dalign = 0;
846  mp->m_swidth = 0;
847  } else if (mp->m_dalign) {
848  mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
849  } else {
850  if (mp->m_flags & XFS_MOUNT_RETERR) {
851  xfs_warn(mp, "alignment check failed: "
852  "sunit(%d) less than bsize(%d)",
853  mp->m_dalign,
854  mp->m_blockmask +1);
855  return XFS_ERROR(EINVAL);
856  }
857  mp->m_swidth = 0;
858  }
859  }
860 
861  /*
862  * Update superblock with new values
863  * and log changes
864  */
865  if (xfs_sb_version_hasdalign(sbp)) {
866  if (sbp->sb_unit != mp->m_dalign) {
867  sbp->sb_unit = mp->m_dalign;
868  mp->m_update_flags |= XFS_SB_UNIT;
869  }
870  if (sbp->sb_width != mp->m_swidth) {
871  sbp->sb_width = mp->m_swidth;
872  mp->m_update_flags |= XFS_SB_WIDTH;
873  }
874  }
875  } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
876  xfs_sb_version_hasdalign(&mp->m_sb)) {
877  mp->m_dalign = sbp->sb_unit;
878  mp->m_swidth = sbp->sb_width;
879  }
880 
881  return 0;
882 }
883 
884 /*
885  * Set the maximum inode count for this filesystem
886  */
887 STATIC void
888 xfs_set_maxicount(xfs_mount_t *mp)
889 {
890  xfs_sb_t *sbp = &(mp->m_sb);
891  __uint64_t icount;
892 
893  if (sbp->sb_imax_pct) {
894  /*
895  * Make sure the maximum inode count is a multiple
896  * of the units we allocate inodes in.
897  */
898  icount = sbp->sb_dblocks * sbp->sb_imax_pct;
899  do_div(icount, 100);
900  do_div(icount, mp->m_ialloc_blks);
901  mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
902  sbp->sb_inopblog;
903  } else {
904  mp->m_maxicount = 0;
905  }
906 }
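/*
 * A worked example of the calculation above, with illustrative numbers
 * rather than defaults: for sb_dblocks = 1000000, sb_imax_pct = 25,
 * m_ialloc_blks = 4 and sb_inopblog = 4 (16 inodes per block), icount
 * becomes 1000000 * 25 / 100 = 250000 blocks, the two do_div() calls
 * round that down to a multiple of the 4-block inode chunk, and
 * m_maxicount = 250000 << 4 = 4000000 inodes.
 */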
907 
908 /*
909  * Set the default minimum read and write sizes unless
910  * already specified in a mount option.
911  * We use smaller I/O sizes when the file system
912  * is being used for NFS service (wsync mount option).
913  */
914 STATIC void
915 xfs_set_rw_sizes(xfs_mount_t *mp)
916 {
917  xfs_sb_t *sbp = &(mp->m_sb);
918  int readio_log, writeio_log;
919 
920  if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
921  if (mp->m_flags & XFS_MOUNT_WSYNC) {
922  readio_log = XFS_WSYNC_READIO_LOG;
923  writeio_log = XFS_WSYNC_WRITEIO_LOG;
924  } else {
925  readio_log = XFS_READIO_LOG_LARGE;
926  writeio_log = XFS_WRITEIO_LOG_LARGE;
927  }
928  } else {
929  readio_log = mp->m_readio_log;
930  writeio_log = mp->m_writeio_log;
931  }
932 
933  if (sbp->sb_blocklog > readio_log) {
934  mp->m_readio_log = sbp->sb_blocklog;
935  } else {
936  mp->m_readio_log = readio_log;
937  }
938  mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
939  if (sbp->sb_blocklog > writeio_log) {
940  mp->m_writeio_log = sbp->sb_blocklog;
941  } else {
942  mp->m_writeio_log = writeio_log;
943  }
944  mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
945 }
946 
947 /*
948  * precalculate the low space thresholds for dynamic speculative preallocation.
949  */
950 void
951 xfs_set_low_space_thresholds(
952  struct xfs_mount *mp)
953 {
954  int i;
955 
956  for (i = 0; i < XFS_LOWSP_MAX; i++) {
957  __uint64_t space = mp->m_sb.sb_dblocks;
958 
959  do_div(space, 100);
960  mp->m_low_space[i] = space * (i + 1);
961  }
962 }
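/*
 * A worked example of the loop above: XFS_LOWSP_MAX covers the 1%..5%
 * levels, so a filesystem with sb_dblocks = 1000000 ends up with
 * m_low_space[0..4] = 10000, 20000, 30000, 40000 and 50000 blocks.
 * Speculative preallocation is throttled back as the free-block count
 * falls through these thresholds.
 */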
963 
964 
965 /*
966  * Set whether we're using inode alignment.
967  */
968 STATIC void
969 xfs_set_inoalignment(xfs_mount_t *mp)
970 {
971  if (xfs_sb_version_hasalign(&mp->m_sb) &&
972  mp->m_sb.sb_inoalignmt >=
973  XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
974  mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
975  else
976  mp->m_inoalign_mask = 0;
977  /*
978  * If we are using stripe alignment, check whether
979  * the stripe unit is a multiple of the inode alignment
980  */
981  if (mp->m_dalign && mp->m_inoalign_mask &&
982  !(mp->m_dalign & mp->m_inoalign_mask))
983  mp->m_sinoalign = mp->m_dalign;
984  else
985  mp->m_sinoalign = 0;
986 }
987 
988 /*
989  * Check that the data (and log if separate) are an ok size.
990  */
991 STATIC int
992 xfs_check_sizes(xfs_mount_t *mp)
993 {
994  xfs_buf_t *bp;
995  xfs_daddr_t d;
996 
997  d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
998  if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
999  xfs_warn(mp, "filesystem size mismatch detected");
1000  return XFS_ERROR(EFBIG);
1001  }
1002  bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003  d - XFS_FSS_TO_BB(mp, 1),
1004  XFS_FSS_TO_BB(mp, 1), 0);
1005  if (!bp) {
1006  xfs_warn(mp, "last sector read failed");
1007  return EIO;
1008  }
1009  xfs_buf_relse(bp);
1010 
1011  if (mp->m_logdev_targp != mp->m_ddev_targp) {
1012  d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1013  if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1014  xfs_warn(mp, "log size mismatch detected");
1015  return XFS_ERROR(EFBIG);
1016  }
1017  bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018  d - XFS_FSB_TO_BB(mp, 1),
1019  XFS_FSB_TO_BB(mp, 1), 0);
1020  if (!bp) {
1021  xfs_warn(mp, "log device read failed");
1022  return EIO;
1023  }
1024  xfs_buf_relse(bp);
1025  }
1026  return 0;
1027 }
1028 
1029 /*
1030  * Clear the quotaflags in memory and in the superblock.
1031  */
1032 int
1033 xfs_mount_reset_sbqflags(
1034  struct xfs_mount *mp)
1035 {
1036  int error;
1037  struct xfs_trans *tp;
1038 
1039  mp->m_qflags = 0;
1040 
1041  /*
1042  * It is OK to look at sb_qflags here in mount path,
1043  * without m_sb_lock.
1044  */
1045  if (mp->m_sb.sb_qflags == 0)
1046  return 0;
1047  spin_lock(&mp->m_sb_lock);
1048  mp->m_sb.sb_qflags = 0;
1049  spin_unlock(&mp->m_sb_lock);
1050 
1051  /*
1052  * If the fs is readonly, let the incore superblock run
1053  * with quotas off but don't flush the update out to disk
1054  */
1055  if (mp->m_flags & XFS_MOUNT_RDONLY)
1056  return 0;
1057 
1058  tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1059  error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1060  XFS_DEFAULT_LOG_COUNT);
1061  if (error) {
1062  xfs_trans_cancel(tp, 0);
1063  xfs_alert(mp, "%s: Superblock update failed!", __func__);
1064  return error;
1065  }
1066 
1067  xfs_mod_sb(tp, XFS_SB_QFLAGS);
1068  return xfs_trans_commit(tp, 0);
1069 }
1070 
1071 __uint64_t
1072 xfs_default_resblks(xfs_mount_t *mp)
1073 {
1074  __uint64_t resblks;
1075 
1076  /*
1077  * We default to 5% or 8192 fsbs of space reserved, whichever is
1078  * smaller. This is intended to cover concurrent allocation
1079  * transactions when we initially hit enospc. These each require a 4
1080  * block reservation. Hence by default we cover roughly 2000 concurrent
1081  * allocation reservations.
1082  */
1083  resblks = mp->m_sb.sb_dblocks;
1084  do_div(resblks, 20);
1085  resblks = min_t(__uint64_t, resblks, 8192);
1086  return resblks;
1087 }
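/*
 * Worked examples of the default above: a filesystem with
 * sb_dblocks = 100000 reserves 100000 / 20 = 5000 blocks (5%), while
 * anything larger than 8192 * 20 = 163840 data blocks hits the cap,
 * e.g. a 1 TiB filesystem with 4 KiB blocks reserves 8192 blocks
 * (32 MiB) rather than 5%.
 */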
1088 
1089 /*
1090  * This function does the following on an initial mount of a file system:
1091  * - reads the superblock from disk and init the mount struct
1092  * - if we're a 32-bit kernel, do a size check on the superblock
1093  * so we don't mount terabyte filesystems
1094  * - init mount struct realtime fields
1095  * - allocate inode hash table for fs
1096  * - init directory manager
1097  * - perform recovery and init the log manager
1098  */
1099 int
1100 xfs_mountfs(
1101  xfs_mount_t *mp)
1102 {
1103  xfs_sb_t *sbp = &(mp->m_sb);
1104  xfs_inode_t *rip;
1105  __uint64_t resblks;
1106  uint quotamount = 0;
1107  uint quotaflags = 0;
1108  int error = 0;
1109 
1110  xfs_mount_common(mp, sbp);
1111 
1112  /*
1113  * Check for a mismatched features2 values. Older kernels
1114  * read & wrote into the wrong sb offset for sb_features2
1115  * on some platforms due to xfs_sb_t not being 64bit size aligned
1116  * when sb_features2 was added, which made older superblock
1117  * reading/writing routines swap it as a 64-bit value.
1118  *
1119  * For backwards compatibility, we make both slots equal.
1120  *
1121  * If we detect a mismatched field, we OR the set bits into the
1122  * existing features2 field in case it has already been modified; we
1123  * don't want to lose any features. We then update the bad location
1124  * with the ORed value so that older kernels will see any features2
1125  * flags, and mark the two fields as needing updates once the
1126  * transaction subsystem is online.
1127  */
1128  if (xfs_sb_has_mismatched_features2(sbp)) {
1129  xfs_warn(mp, "correcting sb_features alignment problem");
1130  sbp->sb_features2 |= sbp->sb_bad_features2;
1131  sbp->sb_bad_features2 = sbp->sb_features2;
1132  mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
1133 
1134  /*
1135  * Re-check for ATTR2 in case it was found in bad_features2
1136  * slot.
1137  */
1138  if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1139  !(mp->m_flags & XFS_MOUNT_NOATTR2))
1140  mp->m_flags |= XFS_MOUNT_ATTR2;
1141  }
1142 
1143  if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1144  (mp->m_flags & XFS_MOUNT_NOATTR2)) {
1145  xfs_sb_version_removeattr2(&mp->m_sb);
1146  mp->m_update_flags |= XFS_SB_FEATURES2;
1147 
1148  /* update sb_versionnum for the clearing of the morebits */
1149  if (!sbp->sb_features2)
1150  mp->m_update_flags |= XFS_SB_VERSIONNUM;
1151  }
1152 
1153  /*
1154  * Check if sb_agblocks is aligned at stripe boundary
1155  * If sb_agblocks is NOT aligned turn off m_dalign since
1156  * allocator alignment is within an ag, therefore ag has
1157  * to be aligned at stripe boundary.
1158  */
1159  error = xfs_update_alignment(mp);
1160  if (error)
1161  goto out;
1162 
1163  xfs_alloc_compute_maxlevels(mp);
1164  xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
1165  xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
1166  xfs_ialloc_compute_maxlevels(mp);
1167 
1168  xfs_set_maxicount(mp);
1169 
1170  error = xfs_uuid_mount(mp);
1171  if (error)
1172  goto out;
1173 
1174  /*
1175  * Set the minimum read and write sizes
1176  */
1177  xfs_set_rw_sizes(mp);
1178 
1179  /* set the low space thresholds for dynamic preallocation */
1180  xfs_set_low_space_thresholds(mp);
1181 
1182  /*
1183  * Set the inode cluster size.
1184  * This may still be overridden by the file system
1185  * block size if it is larger than the chosen cluster size.
1186  */
1187  mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
1188 
1189  /*
1190  * Set inode alignment fields
1191  */
1192  xfs_set_inoalignment(mp);
1193 
1194  /*
1195  * Check that the data (and log if separate) are an ok size.
1196  */
1197  error = xfs_check_sizes(mp);
1198  if (error)
1199  goto out_remove_uuid;
1200 
1201  /*
1202  * Initialize realtime fields in the mount structure
1203  */
1204  error = xfs_rtmount_init(mp);
1205  if (error) {
1206  xfs_warn(mp, "RT mount failed");
1207  goto out_remove_uuid;
1208  }
1209 
1210  /*
1211  * Copies the low order bits of the timestamp and the randomly
1212  * set "sequence" number out of a UUID.
1213  */
1214  uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
1215 
1216  mp->m_dmevmask = 0; /* not persistent; set after each mount */
1217 
1218  xfs_dir_mount(mp);
1219 
1220  /*
1221  * Initialize the attribute manager's entries.
1222  */
1223  mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
1224 
1225  /*
1226  * Initialize the precomputed transaction reservations values.
1227  */
1228  xfs_trans_init(mp);
1229 
1230  /*
1231  * Allocate and initialize the per-ag data.
1232  */
1233  spin_lock_init(&mp->m_perag_lock);
1234  INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1235  error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1236  if (error) {
1237  xfs_warn(mp, "Failed per-ag init: %d", error);
1238  goto out_remove_uuid;
1239  }
1240 
1241  if (!sbp->sb_logblocks) {
1242  xfs_warn(mp, "no log defined");
1243  XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1244  error = XFS_ERROR(EFSCORRUPTED);
1245  goto out_free_perag;
1246  }
1247 
1248  /*
1249  * log's mount-time initialization. Perform 1st part recovery if needed
1250  */
1251  error = xfs_log_mount(mp, mp->m_logdev_targp,
1252  XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1253  XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1254  if (error) {
1255  xfs_warn(mp, "log mount failed");
1256  goto out_fail_wait;
1257  }
1258 
1259  /*
1260  * Now the log is mounted, we know if it was an unclean shutdown or
1261  * not. If it was, the first phase of recovery has completed and we
1262  * have consistent AG blocks on disk. We have not recovered EFIs yet,
1263  * but they are recovered transactionally in the second recovery phase
1264  * later.
1265  *
1266  * Hence we can safely re-initialise incore superblock counters from
1267  * the per-ag data. These may not be correct if the filesystem was not
1268  * cleanly unmounted, so we need to wait for recovery to finish before
1269  * doing this.
1270  *
1271  * If the filesystem was cleanly unmounted, then we can trust the
1272  * values in the superblock to be correct and we don't need to do
1273  * anything here.
1274  *
1275  * If we are currently making the filesystem, the initialisation will
1276  * fail as the perag data is in an undefined state.
1277  */
1278  if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1279  !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1280  !mp->m_sb.sb_inprogress) {
1281  error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1282  if (error)
1283  goto out_fail_wait;
1284  }
1285 
1286  /*
1287  * Get and sanity-check the root inode.
1288  * Save the pointer to it in the mount structure.
1289  */
1290  error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1291  if (error) {
1292  xfs_warn(mp, "failed to read root inode");
1293  goto out_log_dealloc;
1294  }
1295 
1296  ASSERT(rip != NULL);
1297 
1298  if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
1299  xfs_warn(mp, "corrupted root inode %llu: not a directory",
1300  (unsigned long long)rip->i_ino);
1301  xfs_iunlock(rip, XFS_ILOCK_EXCL);
1302  XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1303  mp);
1304  error = XFS_ERROR(EFSCORRUPTED);
1305  goto out_rele_rip;
1306  }
1307  mp->m_rootip = rip; /* save it */
1308 
1309  xfs_iunlock(rip, XFS_ILOCK_EXCL);
1310 
1311  /*
1312  * Initialize realtime inode pointers in the mount structure
1313  */
1314  error = xfs_rtmount_inodes(mp);
1315  if (error) {
1316  /*
1317  * Free up the root inode.
1318  */
1319  xfs_warn(mp, "failed to read RT inodes");
1320  goto out_rele_rip;
1321  }
1322 
1323  /*
1324  * If this is a read-only mount defer the superblock updates until
1325  * the next remount into writeable mode. Otherwise we would never
1326  * perform the update e.g. for the root filesystem.
1327  */
1328  if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1329  error = xfs_mount_log_sb(mp, mp->m_update_flags);
1330  if (error) {
1331  xfs_warn(mp, "failed to write sb changes");
1332  goto out_rtunmount;
1333  }
1334  }
1335 
1336  /*
1337  * Initialise the XFS quota management subsystem for this mount
1338  */
1339  if (XFS_IS_QUOTA_RUNNING(mp)) {
1340  error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
1341  if (error)
1342  goto out_rtunmount;
1343  } else {
1344  ASSERT(!XFS_IS_QUOTA_ON(mp));
1345 
1346  /*
1347  * If a file system had quotas running earlier, but decided to
1348  * mount without -o uquota/pquota/gquota options, revoke the
1349  * quotachecked license.
1350  */
1351  if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1352  xfs_notice(mp, "resetting quota flags");
1353  error = xfs_mount_reset_sbqflags(mp);
1354  if (error)
1355  return error;
1356  }
1357  }
1358 
1359  /*
1360  * Finish recovering the file system. This part needed to be
1361  * delayed until after the root and real-time bitmap inodes
1362  * were consistently read in.
1363  */
1364  error = xfs_log_mount_finish(mp);
1365  if (error) {
1366  xfs_warn(mp, "log mount finish failed");
1367  goto out_rtunmount;
1368  }
1369 
1370  /*
1371  * Complete the quota initialisation, post-log-replay component.
1372  */
1373  if (quotamount) {
1374  ASSERT(mp->m_qflags == 0);
1375  mp->m_qflags = quotaflags;
1376 
1377  xfs_qm_mount_quotas(mp);
1378  }
1379 
1380  /*
1381  * Now we are mounted, reserve a small amount of unused space for
1382  * privileged transactions. This is needed so that transaction
1383  * space required for critical operations can dip into this pool
1384  * when at ENOSPC. This is needed for operations like create with
1385  * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1386  * are not allowed to use this reserved space.
1387  *
1388  * This may drive us straight to ENOSPC on mount, but that implies
1389  * we were already there on the last unmount. Warn if this occurs.
1390  */
1391  if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1392  resblks = xfs_default_resblks(mp);
1393  error = xfs_reserve_blocks(mp, &resblks, NULL);
1394  if (error)
1395  xfs_warn(mp,
1396  "Unable to allocate reserve blocks. Continuing without reserve pool.");
1397  }
1398 
1399  return 0;
1400 
1401  out_rtunmount:
1402  xfs_rtunmount_inodes(mp);
1403  out_rele_rip:
1404  IRELE(rip);
1405  out_log_dealloc:
1406  xfs_log_unmount(mp);
1407  out_fail_wait:
1408  if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
1409  xfs_wait_buftarg(mp->m_logdev_targp);
1410  xfs_wait_buftarg(mp->m_ddev_targp);
1411  out_free_perag:
1412  xfs_free_perag(mp);
1413  out_remove_uuid:
1414  xfs_uuid_unmount(mp);
1415  out:
1416  return error;
1417 }
1418 
1419 /*
1420  * This flushes out the inodes, dquots and the superblock, unmounts the
1421  * log and makes sure that incore structures are freed.
1422  */
1423 void
1424 xfs_unmountfs(
1425  struct xfs_mount *mp)
1426 {
1427  __uint64_t resblks;
1428  int error;
1429 
1430  xfs_qm_unmount_quotas(mp);
1431  xfs_rtunmount_inodes(mp);
1432  IRELE(mp->m_rootip);
1433 
1434  /*
1435  * We can potentially deadlock here if we have an inode cluster
1436  * that has been freed but whose buffer is still pinned in memory because
1437  * the transaction is still sitting in an iclog. The stale inodes
1438  * on that buffer will have their flush locks held until the
1439  * transaction hits the disk and the callbacks run. The inode
1440  * flush takes the flush lock unconditionally and with nothing to
1441  * push out the iclog we will never get that unlocked. Hence we
1442  * need to force the log first.
1443  */
1444  xfs_log_force(mp, XFS_LOG_SYNC);
1445 
1446  /*
1447  * Flush all pending changes from the AIL.
1448  */
1449  xfs_ail_push_all_sync(mp->m_ail);
1450 
1451  /*
1452  * And reclaim all inodes. At this point there should be no dirty
1453  * inode, and none should be pinned or locked, but use synchronous
1454  * reclaim just to be sure.
1455  */
1456  xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 
1458  xfs_qm_unmount(mp);
1459 
1460  /*
1461  * Flush out the log synchronously so that we know for sure
1462  * that nothing is pinned. This is important because bflush()
1463  * will skip pinned buffers.
1464  */
1465  xfs_log_force(mp, XFS_LOG_SYNC);
1466 
1467  /*
1468  * Unreserve any blocks we have so that when we unmount we don't account
1469  * the reserved free space as used. This is really only necessary for
1470  * lazy superblock counting because it trusts the incore superblock
1471  * counters to be absolutely correct on clean unmount.
1472  *
1473  * We don't bother correcting this elsewhere for lazy superblock
1474  * counting because on mount of an unclean filesystem we reconstruct the
1475  * correct counter value and this is irrelevant.
1476  *
1477  * For non-lazy counter filesystems, this doesn't matter at all because
1478  * we only ever apply deltas to the superblock and hence the incore
1479  * value does not matter....
1480  */
1481  resblks = 0;
1482  error = xfs_reserve_blocks(mp, &resblks, NULL);
1483  if (error)
1484  xfs_warn(mp, "Unable to free reserved block pool. "
1485  "Freespace may not be correct on next mount.");
1486 
1487  error = xfs_log_sbcount(mp);
1488  if (error)
1489  xfs_warn(mp, "Unable to update superblock counters. "
1490  "Freespace may not be correct on next mount.");
1491 
1492  /*
1493  * At this point we might have modified the superblock again and thus
1494  * added an item to the AIL, thus flush it again.
1495  */
1496  xfs_ail_push_all_sync(mp->m_ail);
1497  xfs_wait_buftarg(mp->m_ddev_targp);
1498 
1499  /*
1500  * The superblock buffer is uncached and xfsaild_push() will lock and
1501  * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502  * here but a lock on the superblock buffer will block until iodone()
1503  * has completed.
1504  */
1505  xfs_buf_lock(mp->m_sb_bp);
1506  xfs_buf_unlock(mp->m_sb_bp);
1507 
1508  xfs_log_unmount_write(mp);
1509  xfs_log_unmount(mp);
1510  xfs_uuid_unmount(mp);
1511 
1512 #if defined(DEBUG)
1513  xfs_errortag_clearall(mp, 0);
1514 #endif
1515  xfs_free_perag(mp);
1516 }
1517 
1518 int
1519 xfs_fs_writable(xfs_mount_t *mp)
1520 {
1521  return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
1522  (mp->m_flags & XFS_MOUNT_RDONLY));
1523 }
1524 
1525 /*
1526  * xfs_log_sbcount
1527  *
1528  * Sync the superblock counters to disk.
1529  *
1530  * Note this code can be called during the process of freezing, so
1531  * we may need to use the transaction allocator which does not
1532  * block when the transaction subsystem is in its frozen state.
1533  */
1534 int
1535 xfs_log_sbcount(xfs_mount_t *mp)
1536 {
1537  xfs_trans_t *tp;
1538  int error;
1539 
1540  if (!xfs_fs_writable(mp))
1541  return 0;
1542 
1543  xfs_icsb_sync_counters(mp, 0);
1544 
1545  /*
1546  * we don't need to do this if we are updating the superblock
1547  * counters on every modification.
1548  */
1549  if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1550  return 0;
1551 
1552  tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
1553  error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1554  XFS_DEFAULT_LOG_COUNT);
1555  if (error) {
1556  xfs_trans_cancel(tp, 0);
1557  return error;
1558  }
1559 
1560  xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1561  xfs_trans_set_sync(tp);
1562  error = xfs_trans_commit(tp, 0);
1563  return error;
1564 }
1565 
1566 /*
1567  * xfs_mod_sb() can be used to copy arbitrary changes to the
1568  * in-core superblock into the superblock buffer to be logged.
1569  * It does not provide the higher level of locking that is
1570  * needed to protect the in-core superblock from concurrent
1571  * access.
1572  */
1573 void
1574 xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1575 {
1576  xfs_buf_t *bp;
1577  int first;
1578  int last;
1579  xfs_mount_t *mp;
1580  xfs_sb_field_t f;
1581 
1582  ASSERT(fields);
1583  if (!fields)
1584  return;
1585  mp = tp->t_mountp;
1586  bp = xfs_trans_getsb(tp, mp, 0);
1587  first = sizeof(xfs_sb_t);
1588  last = 0;
1589 
1590  /* translate/copy */
1591 
1592  xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1593 
1594  /* find modified range */
1595  f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1596  ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1597  last = xfs_sb_info[f + 1].offset - 1;
1598 
1599  f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1600  ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1601  first = xfs_sb_info[f].offset;
1602 
1603  xfs_trans_log_buf(tp, bp, first, last);
1604 }
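/*
 * A small sketch of the range computation above, assuming only
 * XFS_SB_ICOUNT and XFS_SB_FDBLOCKS are set in "fields": the lowest set
 * bit selects sb_icount, so "first" is offsetof(xfs_sb_t, sb_icount);
 * the highest set bit selects sb_fdblocks, so "last" is the offset of
 * the field after sb_fdblocks minus one. The bytes in between are
 * logged as a single contiguous range even if they did not change.
 */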
1605 
1606 
1607 /*
1608  * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
1609  * a delta to a specified field in the in-core superblock. Simply
1610  * switch on the field indicated and apply the delta to that field.
1611  * Fields are not allowed to dip below zero, so if the delta would
1612  * do this do not apply it and return EINVAL.
1613  *
1614  * The m_sb_lock must be held when this routine is called.
1615  */
1616 STATIC int
1617 xfs_mod_incore_sb_unlocked(
1618  xfs_mount_t *mp,
1619  xfs_sb_field_t field,
1620  int64_t delta,
1621  int rsvd)
1622 {
1623  int scounter; /* short counter for 32 bit fields */
1624  long long lcounter; /* long counter for 64 bit fields */
1625  long long res_used, rem;
1626 
1627  /*
1628  * With the in-core superblock spin lock held, switch
1629  * on the indicated field. Apply the delta to the
1630  * proper field. If the fields value would dip below
1631  * 0, then do not apply the delta and return EINVAL.
1632  */
1633  switch (field) {
1634  case XFS_SBS_ICOUNT:
1635  lcounter = (long long)mp->m_sb.sb_icount;
1636  lcounter += delta;
1637  if (lcounter < 0) {
1638  ASSERT(0);
1639  return XFS_ERROR(EINVAL);
1640  }
1641  mp->m_sb.sb_icount = lcounter;
1642  return 0;
1643  case XFS_SBS_IFREE:
1644  lcounter = (long long)mp->m_sb.sb_ifree;
1645  lcounter += delta;
1646  if (lcounter < 0) {
1647  ASSERT(0);
1648  return XFS_ERROR(EINVAL);
1649  }
1650  mp->m_sb.sb_ifree = lcounter;
1651  return 0;
1652  case XFS_SBS_FDBLOCKS:
1653  lcounter = (long long)
1654  mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1655  res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1656 
1657  if (delta > 0) { /* Putting blocks back */
1658  if (res_used > delta) {
1659  mp->m_resblks_avail += delta;
1660  } else {
1661  rem = delta - res_used;
1662  mp->m_resblks_avail = mp->m_resblks;
1663  lcounter += rem;
1664  }
1665  } else { /* Taking blocks away */
1666  lcounter += delta;
1667  if (lcounter >= 0) {
1668  mp->m_sb.sb_fdblocks = lcounter +
1669  XFS_ALLOC_SET_ASIDE(mp);
1670  return 0;
1671  }
1672 
1673  /*
1674  * We are out of blocks, use any available reserved
1675  * blocks if we're allowed to.
1676  */
1677  if (!rsvd)
1678  return XFS_ERROR(ENOSPC);
1679 
1680  lcounter = (long long)mp->m_resblks_avail + delta;
1681  if (lcounter >= 0) {
1682  mp->m_resblks_avail = lcounter;
1683  return 0;
1684  }
1686  "Filesystem \"%s\": reserve blocks depleted! "
1687  "Consider increasing reserve pool size.",
1688  mp->m_fsname);
1689  return XFS_ERROR(ENOSPC);
1690  }
1691 
1692  mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1693  return 0;
1694  case XFS_SBS_FREXTENTS:
1695  lcounter = (long long)mp->m_sb.sb_frextents;
1696  lcounter += delta;
1697  if (lcounter < 0) {
1698  return XFS_ERROR(ENOSPC);
1699  }
1700  mp->m_sb.sb_frextents = lcounter;
1701  return 0;
1702  case XFS_SBS_DBLOCKS:
1703  lcounter = (long long)mp->m_sb.sb_dblocks;
1704  lcounter += delta;
1705  if (lcounter < 0) {
1706  ASSERT(0);
1707  return XFS_ERROR(EINVAL);
1708  }
1709  mp->m_sb.sb_dblocks = lcounter;
1710  return 0;
1711  case XFS_SBS_AGCOUNT:
1712  scounter = mp->m_sb.sb_agcount;
1713  scounter += delta;
1714  if (scounter < 0) {
1715  ASSERT(0);
1716  return XFS_ERROR(EINVAL);
1717  }
1718  mp->m_sb.sb_agcount = scounter;
1719  return 0;
1720  case XFS_SBS_IMAX_PCT:
1721  scounter = mp->m_sb.sb_imax_pct;
1722  scounter += delta;
1723  if (scounter < 0) {
1724  ASSERT(0);
1725  return XFS_ERROR(EINVAL);
1726  }
1727  mp->m_sb.sb_imax_pct = scounter;
1728  return 0;
1729  case XFS_SBS_REXTSIZE:
1730  scounter = mp->m_sb.sb_rextsize;
1731  scounter += delta;
1732  if (scounter < 0) {
1733  ASSERT(0);
1734  return XFS_ERROR(EINVAL);
1735  }
1736  mp->m_sb.sb_rextsize = scounter;
1737  return 0;
1738  case XFS_SBS_RBMBLOCKS:
1739  scounter = mp->m_sb.sb_rbmblocks;
1740  scounter += delta;
1741  if (scounter < 0) {
1742  ASSERT(0);
1743  return XFS_ERROR(EINVAL);
1744  }
1745  mp->m_sb.sb_rbmblocks = scounter;
1746  return 0;
1747  case XFS_SBS_RBLOCKS:
1748  lcounter = (long long)mp->m_sb.sb_rblocks;
1749  lcounter += delta;
1750  if (lcounter < 0) {
1751  ASSERT(0);
1752  return XFS_ERROR(EINVAL);
1753  }
1754  mp->m_sb.sb_rblocks = lcounter;
1755  return 0;
1756  case XFS_SBS_REXTENTS:
1757  lcounter = (long long)mp->m_sb.sb_rextents;
1758  lcounter += delta;
1759  if (lcounter < 0) {
1760  ASSERT(0);
1761  return XFS_ERROR(EINVAL);
1762  }
1763  mp->m_sb.sb_rextents = lcounter;
1764  return 0;
1765  case XFS_SBS_REXTSLOG:
1766  scounter = mp->m_sb.sb_rextslog;
1767  scounter += delta;
1768  if (scounter < 0) {
1769  ASSERT(0);
1770  return XFS_ERROR(EINVAL);
1771  }
1772  mp->m_sb.sb_rextslog = scounter;
1773  return 0;
1774  default:
1775  ASSERT(0);
1776  return XFS_ERROR(EINVAL);
1777  }
1778 }
1779 
1780 /*
1781  * xfs_mod_incore_sb() is used to change a field in the in-core
1782  * superblock structure by the specified delta. This modification
1783  * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
1784  * routine to do the work.
1785  */
1786 int
1787 xfs_mod_incore_sb(
1788  struct xfs_mount *mp,
1789  xfs_sb_field_t field,
1790  int64_t delta,
1791  int rsvd)
1792 {
1793  int status;
1794 
1795 #ifdef HAVE_PERCPU_SB
1796  ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1797 #endif
1798  spin_lock(&mp->m_sb_lock);
1799  status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1800  spin_unlock(&mp->m_sb_lock);
1801 
1802  return status;
1803 }
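/*
 * A minimal usage sketch, assuming a caller returning "rtextents"
 * realtime extents to the filesystem (a counter that is never tracked
 * per-cpu, so it always takes this path):
 *
 *	error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
 *				  (int64_t)rtextents, 0);
 *
 * The final argument says whether the reserved block pool may be used
 * when XFS_SBS_FDBLOCKS would otherwise underflow; on HAVE_PERCPU_SB
 * kernels the ICOUNT/IFREE/FDBLOCKS counters go through
 * xfs_icsb_modify_counters() instead.
 */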
1804 
1805 /*
1806  * Change more than one field in the in-core superblock structure at a time.
1807  *
1808  * The fields and changes to those fields are specified in the array of
1809  * xfs_mod_sb structures passed in. Either all of the specified deltas
1810  * will be applied or none of them will. If any modified field dips below 0,
1811  * then all modifications will be backed out and EINVAL will be returned.
1812  *
1813  * Note that this function may not be used for the superblock values that
1814  * are tracked with the in-memory per-cpu counters - a direct call to
1815  * xfs_icsb_modify_counters is required for these.
1816  */
1817 int
1818 xfs_mod_incore_sb_batch(
1819  struct xfs_mount *mp,
1820  xfs_mod_sb_t *msb,
1821  uint nmsb,
1822  int rsvd)
1823 {
1824  xfs_mod_sb_t *msbp;
1825  int error = 0;
1826 
1827  /*
1828  * Loop through the array of mod structures and apply each individually.
1829  * If any fail, then back out all those which have already been applied.
1830  * Do all of this within the scope of the m_sb_lock so that all of the
1831  * changes will be atomic.
1832  */
1833  spin_lock(&mp->m_sb_lock);
1834  for (msbp = msb; msbp < (msb + nmsb); msbp++) {
1835  ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1836  msbp->msb_field > XFS_SBS_FDBLOCKS);
1837 
1838  error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1839  msbp->msb_delta, rsvd);
1840  if (error)
1841  goto unwind;
1842  }
1843  spin_unlock(&mp->m_sb_lock);
1844  return 0;
1845 
1846 unwind:
1847  while (--msbp >= msb) {
1848  error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1849  -msbp->msb_delta, rsvd);
1850  ASSERT(error == 0);
1851  }
1852  spin_unlock(&mp->m_sb_lock);
1853  return error;
1854 }
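/*
 * A minimal usage sketch for the batch interface, with hypothetical
 * deltas dblkdelta and rtxdelta computed by the caller:
 *
 *	xfs_mod_sb_t msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_DBLOCKS;
 *	msb[0].msb_delta = dblkdelta;
 *	msb[1].msb_field = XFS_SBS_FREXTENTS;
 *	msb[1].msb_delta = rtxdelta;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, rsvd);
 *
 * Either both deltas are applied under m_sb_lock or, if one of them
 * would drive its counter negative, both are backed out and an error
 * is returned.
 */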
1855 
1856 /*
1857  * xfs_getsb() is called to obtain the buffer for the superblock.
1858  * The buffer is returned locked and read in from disk.
1859  * The buffer should be released with a call to xfs_brelse().
1860  *
1861  * If the flags parameter is BUF_TRYLOCK, then we'll only return
1862  * the superblock buffer if it can be locked without sleeping.
1863  * If it can't then we'll return NULL.
1864  */
1865 struct xfs_buf *
1866 xfs_getsb(
1867  struct xfs_mount *mp,
1868  int flags)
1869 {
1870  struct xfs_buf *bp = mp->m_sb_bp;
1871 
1872  if (!xfs_buf_trylock(bp)) {
1873  if (flags & XBF_TRYLOCK)
1874  return NULL;
1875  xfs_buf_lock(bp);
1876  }
1877 
1878  xfs_buf_hold(bp);
1879  ASSERT(XFS_BUF_ISDONE(bp));
1880  return bp;
1881 }
1882 
1883 /*
1884  * Used to free the superblock along various error paths.
1885  */
1886 void
1887 xfs_freesb(
1888  struct xfs_mount *mp)
1889 {
1890  struct xfs_buf *bp = mp->m_sb_bp;
1891 
1892  xfs_buf_lock(bp);
1893  mp->m_sb_bp = NULL;
1894  xfs_buf_relse(bp);
1895 }
1896 
1897 /*
1898  * Used to log changes to the superblock unit and width fields which could
1899  * be altered by the mount options, as well as any potential sb_features2
1900  * fixup. Only the first superblock is updated.
1901  */
1902 int
1903 xfs_mount_log_sb(
1904  xfs_mount_t *mp,
1905  __int64_t fields)
1906 {
1907  xfs_trans_t *tp;
1908  int error;
1909 
1910  ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1911  XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1912  XFS_SB_VERSIONNUM));
1913 
1914  tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1915  error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1916  XFS_DEFAULT_LOG_COUNT);
1917  if (error) {
1918  xfs_trans_cancel(tp, 0);
1919  return error;
1920  }
1921  xfs_mod_sb(tp, fields);
1922  error = xfs_trans_commit(tp, 0);
1923  return error;
1924 }
1925 
1926 /*
1927  * If the underlying (data/log/rt) device is readonly, there are some
1928  * operations that cannot proceed.
1929  */
1930 int
1931 xfs_dev_is_read_only(
1932  struct xfs_mount *mp,
1933  char *message)
1934 {
1935  if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1936  xfs_readonly_buftarg(mp->m_logdev_targp) ||
1937  (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1938  xfs_notice(mp, "%s required on read-only device.", message);
1939  xfs_notice(mp, "write access unavailable, cannot proceed.");
1940  return EROFS;
1941  }
1942  return 0;
1943 }
1944 
1945 #ifdef HAVE_PERCPU_SB
1946 /*
1947  * Per-cpu incore superblock counters
1948  *
1949  * Simple concept, difficult implementation
1950  *
1951  * Basically, replace the incore superblock counters with a distributed per cpu
1952  * counter for contended fields (e.g. free block count).
1953  *
1954  * Difficulties arise in that the incore sb is used for ENOSPC checking, and
1955  * hence needs to be accurately read when we are running low on space. Hence
1956  * there is a method to enable and disable the per-cpu counters based on how
1957  * much "stuff" is available in them.
1958  *
1959  * Basically, a counter is enabled if there is enough free resource to justify
1960  * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
1961  * ENOSPC), then we disable the counters to synchronise all callers and
1962  * re-distribute the available resources.
1963  *
1964  * If, once we redistributed the available resources, we still get a failure,
1965  * we disable the per-cpu counter and go through the slow path.
1966  *
1967  * The slow path is the current xfs_mod_incore_sb() function. This means that
1968  * when we disable a per-cpu counter, we need to drain its resources back to
1969  * the global superblock. We do this after disabling the counter to prevent
1970  * more threads from queueing up on the counter.
1971  *
1972  * Essentially, this means that we still need a lock in the fast path to enable
1973  * synchronisation between the global counters and the per-cpu counters. This
1974  * is not a problem because the lock will be local to a CPU almost all the time
1975  * and have little contention except when we get to ENOSPC conditions.
1976  *
1977  * Basically, this lock becomes a barrier that enables us to lock out the fast
1978  * path while we do things like enabling and disabling counters and
1979  * synchronising the counters.
1980  *
1981  * Locking rules:
1982  *
1983  * 1. m_sb_lock before picking up per-cpu locks
1984  * 2. per-cpu locks always picked up via for_each_online_cpu() order
1985  * 3. accurate counter sync requires m_sb_lock + per cpu locks
1986  * 4. modifying per-cpu counters requires holding per-cpu lock
1987  * 5. modifying global counters requires holding m_sb_lock
1988  * 6. enabling or disabling a counter requires holding the m_sb_lock
1989  * and _none_ of the per-cpu locks.
1990  *
1991  * Disabled counters are only ever re-enabled by a balance operation
1992  * that results in more free resources per CPU than a given threshold.
1993  * To ensure counters don't remain disabled, they are rebalanced when
1994  * the global resource goes above a higher threshold (i.e. some hysteresis
1995  * is present to prevent thrashing).
1996  */
1997 
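Editor's note: to make the scheme above concrete, here is a small self-contained userspace analogue (an editorial sketch, not kernel code): pthread mutexes stand in for the per-cpu and m_sb_lock spinlocks, NSLOTS stands in for the online CPUs, and a single free-block counter stands in for the three superblock counters. It shows the fast path, the disable-and-drain on local exhaustion, the global slow path, and the threshold-gated re-enable.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS	4			/* stand-in for the number of online CPUs */

struct slot {
	pthread_mutex_t	lock;		/* plays the role of the per-cpu counter lock */
	int64_t		count;
};

static struct slot	slots[NSLOTS];
static pthread_mutex_t	global_lock = PTHREAD_MUTEX_INITIALIZER;	/* m_sb_lock analogue */
static int64_t		global_count;		/* authoritative value while disabled */
static atomic_bool	disabled = true;	/* start disabled; first balance enables */

/* Disable the fast path and drain every slot back into the global value. */
static void disable_and_drain(void)
{
	pthread_mutex_lock(&global_lock);
	disabled = true;
	for (int i = 0; i < NSLOTS; i++) {
		pthread_mutex_lock(&slots[i].lock);
		global_count += slots[i].count;
		slots[i].count = 0;
		pthread_mutex_unlock(&slots[i].lock);
	}
	pthread_mutex_unlock(&global_lock);
}

/* Re-enable only when every slot would get a useful share (the hysteresis). */
static void balance(int64_t min_per_slot)
{
	pthread_mutex_lock(&global_lock);
	int64_t share = global_count / NSLOTS;
	if (share >= min_per_slot) {
		for (int i = 0; i < NSLOTS; i++)
			slots[i].count = share;
		global_count -= share * NSLOTS;	/* residual stays in the global value */
		disabled = false;
	}
	pthread_mutex_unlock(&global_lock);
}

/* Apply 'delta' to the free-block count; 0 on success, -1 on "ENOSPC". */
static int mod_free_blocks(int cpu, int64_t delta)
{
	struct slot *s = &slots[cpu % NSLOTS];

	if (!disabled) {			/* fast path: local lock only */
		pthread_mutex_lock(&s->lock);
		if (s->count + delta >= 0) {
			s->count += delta;
			pthread_mutex_unlock(&s->lock);
			return 0;
		}
		pthread_mutex_unlock(&s->lock);
		disable_and_drain();		/* local ENOSPC: drop to the slow path */
	}

	int ret = -1;				/* slow path: global value under the global lock */
	pthread_mutex_lock(&global_lock);
	if (global_count + delta >= 0) {
		global_count += delta;
		ret = 0;
	}
	pthread_mutex_unlock(&global_lock);

	balance(64);				/* try to hand work back to the fast path */
	return ret;
}

int main(void)
{
	for (int i = 0; i < NSLOTS; i++)
		pthread_mutex_init(&slots[i].lock, NULL);

	global_count = 1000;			/* pretend the fs has 1000 free blocks */
	balance(64);				/* initial balance enables the fast path */

	printf("alloc 300 -> %d\n", mod_free_blocks(0, -300));	/* local miss, slow path succeeds */
	printf("alloc 900 -> %d\n", mod_free_blocks(1, -900));	/* global exhaustion: returns -1 */
	return 0;
}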
1998 #ifdef CONFIG_HOTPLUG_CPU
1999 /*
2000  * hot-plug CPU notifier support.
2001  *
2002  * We need a notifier per filesystem as we need to be able to identify
2003  * the filesystem to balance the counters out. This is achieved by
2004  * having a notifier block embedded in the xfs_mount_t and doing pointer
2005  * magic to get the mount pointer from the notifier block address.
2006  */
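Editor's note: the "pointer magic" referred to above is container_of(): because the notifier_block is embedded in the xfs_mount_t, subtracting the member offset from the notifier's address yields the owning mount. Below is a self-contained userspace illustration; the fake_* structure names are invented for the example, while the real code uses struct notifier_block and the m_icsb_notifier member.

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel structures used by the notifier. */
struct fake_notifier_block {
	int (*call)(void);
};

struct fake_mount {
	long				magic;
	struct fake_notifier_block	notifier;	/* embedded, like m_icsb_notifier */
};

/* container_of: member pointer -> pointer to the enclosing structure. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct fake_mount mp = { .magic = 0x58465342 };	/* "XFSB" */
	struct fake_notifier_block *nfb = &mp.notifier;

	/* The callback only receives nfb; recover the mount it belongs to. */
	struct fake_mount *owner = container_of(nfb, struct fake_mount, notifier);
	printf("recovered mount %p, magic %#lx\n", (void *)owner, owner->magic);
	return (owner == &mp) ? 0 : 1;
}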
2007 STATIC int
2008 xfs_icsb_cpu_notify(
2009  struct notifier_block *nfb,
2010  unsigned long action,
2011  void *hcpu)
2012 {
2013  xfs_icsb_cnts_t *cntp;
2014  xfs_mount_t *mp;
2015 
2016  mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
2017  cntp = (xfs_icsb_cnts_t *)
2018  per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
2019  switch (action) {
2020  case CPU_UP_PREPARE:
2021  case CPU_UP_PREPARE_FROZEN:
2022  /* Easy Case - initialize the area and locks, and
2023  * then rebalance when online does everything else for us. */
2024  memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
2025  break;
2026  case CPU_ONLINE:
2027  case CPU_ONLINE_FROZEN:
2028  xfs_icsb_lock(mp);
2029  xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
2030  xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
2031  xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
2032  xfs_icsb_unlock(mp);
2033  break;
2034  case CPU_DEAD:
2035  case CPU_DEAD_FROZEN:
2036  /* Disable all the counters, then fold the dead cpu's
2037  * count into the total on the global superblock and
2038  * re-enable the counters. */
2039  xfs_icsb_lock(mp);
2040  spin_lock(&mp->m_sb_lock);
2041  xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
2042  xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
2043  xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
2044 
2045  mp->m_sb.sb_icount += cntp->icsb_icount;
2046  mp->m_sb.sb_ifree += cntp->icsb_ifree;
2047  mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
2048 
2049  memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
2050 
2051  xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
2052  xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
2053  xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
2054  spin_unlock(&mp->m_sb_lock);
2055  xfs_icsb_unlock(mp);
2056  break;
2057  }
2058 
2059  return NOTIFY_OK;
2060 }
2061 #endif /* CONFIG_HOTPLUG_CPU */
2062 
2063 int
2064 xfs_icsb_init_counters(
2065  xfs_mount_t *mp)
2066 {
2067  xfs_icsb_cnts_t *cntp;
2068  int i;
2069 
2070  mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
2071  if (mp->m_sb_cnts == NULL)
2072  return -ENOMEM;
2073 
2074 #ifdef CONFIG_HOTPLUG_CPU
2075  mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
2076  mp->m_icsb_notifier.priority = 0;
2077  register_hotcpu_notifier(&mp->m_icsb_notifier);
2078 #endif /* CONFIG_HOTPLUG_CPU */
2079 
2080  for_each_online_cpu(i) {
2081  cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2082  memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
2083  }
2084 
2085  mutex_init(&mp->m_icsb_mutex);
2086 
2087  /*
2088  * start with all counters disabled so that the
2089  * initial balance kicks us off correctly
2090  */
2091  mp->m_icsb_counters = -1;
2092  return 0;
2093 }
2094 
2095 void
2096 xfs_icsb_reinit_counters(
2097  xfs_mount_t *mp)
2098 {
2099  xfs_icsb_lock(mp);
2100  /*
2101  * start with all counters disabled so that the
2102  * initial balance kicks us off correctly
2103  */
2104  mp->m_icsb_counters = -1;
2105  xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
2106  xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
2107  xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
2108  xfs_icsb_unlock(mp);
2109 }
2110 
2111 void
2112 xfs_icsb_destroy_counters(
2113  xfs_mount_t *mp)
2114 {
2115  if (mp->m_sb_cnts) {
2116  unregister_hotcpu_notifier(&mp->m_icsb_notifier);
2117  free_percpu(mp->m_sb_cnts);
2118  }
2119  mutex_destroy(&mp->m_icsb_mutex);
2120 }
2121 
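Editor's note: taken together, the three functions above give the per-cpu machinery its lifecycle: allocate with the counters disabled, rebalance once the on-disk counts are known, and tear down on unmount or on a mount error. The ordering sketch below is hypothetical (the real call sites are in the XFS mount/unmount paths outside this file).

/* Hypothetical lifecycle sketch for the per-cpu superblock counters. */
STATIC int
example_icsb_lifecycle(
	xfs_mount_t	*mp)
{
	int		error;

	error = xfs_icsb_init_counters(mp);	/* allocate per-cpu space, counters disabled */
	if (error)
		return error;

	/* ... read the on-disk superblock into mp->m_sb ... */

	xfs_icsb_reinit_counters(mp);		/* balance now that the sb counts are known */

	/* ... filesystem in use ... */

	xfs_icsb_destroy_counters(mp);		/* unmount or error-path teardown */
	return 0;
}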
2122 STATIC void
2123 xfs_icsb_lock_cntr(
2124  xfs_icsb_cnts_t *icsbp)
2125 {
2126  while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
2127  ndelay(1000);
2128  }
2129 }
2130 
2131 STATIC void
2132 xfs_icsb_unlock_cntr(
2133  xfs_icsb_cnts_t *icsbp)
2134 {
2135  clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
2136 }
2137 
2138 
2139 STATIC void
2140 xfs_icsb_lock_all_counters(
2141  xfs_mount_t *mp)
2142 {
2143  xfs_icsb_cnts_t *cntp;
2144  int i;
2145 
2146  for_each_online_cpu(i) {
2147  cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2148  xfs_icsb_lock_cntr(cntp);
2149  }
2150 }
2151 
2152 STATIC void
2153 xfs_icsb_unlock_all_counters(
2154  xfs_mount_t *mp)
2155 {
2156  xfs_icsb_cnts_t *cntp;
2157  int i;
2158 
2159  for_each_online_cpu(i) {
2160  cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2161  xfs_icsb_unlock_cntr(cntp);
2162  }
2163 }
2164 
2165 STATIC void
2166 xfs_icsb_count(
2167  xfs_mount_t *mp,
2168  xfs_icsb_cnts_t *cnt,
2169  int flags)
2170 {
2171  xfs_icsb_cnts_t *cntp;
2172  int i;
2173 
2174  memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
2175 
2176  if (!(flags & XFS_ICSB_LAZY_COUNT))
2177  xfs_icsb_lock_all_counters(mp);
2178 
2179  for_each_online_cpu(i) {
2180  cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2181  cnt->icsb_icount += cntp->icsb_icount;
2182  cnt->icsb_ifree += cntp->icsb_ifree;
2183  cnt->icsb_fdblocks += cntp->icsb_fdblocks;
2184  }
2185 
2186  if (!(flags & XFS_ICSB_LAZY_COUNT))
2187  xfs_icsb_unlock_all_counters(mp);
2188 }
2189 
2190 STATIC int
2191 xfs_icsb_counter_disabled(
2192  xfs_mount_t *mp,
2193  xfs_sb_field_t field)
2194 {
2195  ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2196  return test_bit(field, &mp->m_icsb_counters);
2197 }
2198 
2199 STATIC void
2200 xfs_icsb_disable_counter(
2201  xfs_mount_t *mp,
2202  xfs_sb_field_t field)
2203 {
2204  xfs_icsb_cnts_t cnt;
2205 
2206  ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2207 
2208  /*
2209  * If we are already disabled, then there is nothing to do
2210  * here. We check before locking all the counters to avoid
2211  * the expensive lock operation when being called in the
2212  * slow path and the counter is already disabled. This is
2213  * safe because the only time we set or clear this state is under
2214  * the m_icsb_mutex.
2215  */
2216  if (xfs_icsb_counter_disabled(mp, field))
2217  return;
2218 
2219  xfs_icsb_lock_all_counters(mp);
2220  if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
2221  /* drain back to superblock */
2222 
2223  xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
2224  switch(field) {
2225  case XFS_SBS_ICOUNT:
2226  mp->m_sb.sb_icount = cnt.icsb_icount;
2227  break;
2228  case XFS_SBS_IFREE:
2229  mp->m_sb.sb_ifree = cnt.icsb_ifree;
2230  break;
2231  case XFS_SBS_FDBLOCKS:
2232  mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
2233  break;
2234  default:
2235  BUG();
2236  }
2237  }
2238 
2239  xfs_icsb_unlock_all_counters(mp);
2240 }
2241 
2242 STATIC void
2243 xfs_icsb_enable_counter(
2244  xfs_mount_t *mp,
2245  xfs_sb_field_t field,
2246  uint64_t count,
2247  uint64_t resid)
2248 {
2249  xfs_icsb_cnts_t *cntp;
2250  int i;
2251 
2252  ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2253 
2254  xfs_icsb_lock_all_counters(mp);
2255  for_each_online_cpu(i) {
2256  cntp = per_cpu_ptr(mp->m_sb_cnts, i);
2257  switch (field) {
2258  case XFS_SBS_ICOUNT:
2259  cntp->icsb_icount = count + resid;
2260  break;
2261  case XFS_SBS_IFREE:
2262  cntp->icsb_ifree = count + resid;
2263  break;
2264  case XFS_SBS_FDBLOCKS:
2265  cntp->icsb_fdblocks = count + resid;
2266  break;
2267  default:
2268  BUG();
2269  break;
2270  }
2271  resid = 0;
2272  }
2273  clear_bit(field, &mp->m_icsb_counters);
2274  xfs_icsb_unlock_all_counters(mp);
2275 }
2276 
2277 void
2278 xfs_icsb_sync_counters_locked(
2279  xfs_mount_t *mp,
2280  int flags)
2281 {
2282  xfs_icsb_cnts_t cnt;
2283 
2284  xfs_icsb_count(mp, &cnt, flags);
2285 
2286  if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
2287  mp->m_sb.sb_icount = cnt.icsb_icount;
2288  if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
2289  mp->m_sb.sb_ifree = cnt.icsb_ifree;
2290  if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
2291  mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
2292 }
2293 
2294 /*
2295  * Accurate update of per-cpu counters to incore superblock
2296  */
2297 void
2298 xfs_icsb_sync_counters(
2299  xfs_mount_t *mp,
2300  int flags)
2301 {
2302  spin_lock(&mp->m_sb_lock);
2303  xfs_icsb_sync_counters_locked(mp, flags);
2304  spin_unlock(&mp->m_sb_lock);
2305 }
2306 
2307 /*
2308  * Balance and enable/disable counters as necessary.
2309  *
2310  * Thresholds for re-enabling counters are somewhat magic. Inode counts are
2311  * chosen to be the same number as a single on-disk allocation chunk per CPU,
2312  * and free blocks is a value far enough from zero that we aren't going to
2313  * thrash when we get near ENOSPC. We also need to supply a minimum we
2314  * require per CPU to prevent looping endlessly when xfs_alloc_space asks for
2315  * more than will be distributed to a single CPU but each CPU has enough
2316  * blocks to be reenabled.
2317  *
2318  * Note that we can be called when counters are already disabled.
2319  * xfs_icsb_disable_counter() optimises the counter locking in this case to
2320  * prevent locking every per-cpu counter needlessly.
2321  */
2322 
2323 #define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
2324 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
2325  (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
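Editor's note: the balancing arithmetic that follows is an integer division of the global value across the online CPUs, with the remainder handed to the first CPU and a refusal to re-enable when the per-CPU share falls below the thresholds above. The standalone program below illustrates the split; it is plain userspace C, not kernel code, and the 512-block threshold stands in for XFS_ICSB_FDBLK_CNTR_REENABLE() without the set-aside term.

#include <stdint.h>
#include <stdio.h>

/* Split 'total' across 'ncpus'; re-enable only if each share >= threshold. */
static void balance_example(uint64_t total, uint64_t ncpus, uint64_t threshold)
{
	uint64_t share = total / ncpus;		/* count after do_div(count, weight) */
	uint64_t resid = total % ncpus;		/* the first CPU gets the residual */

	if (share < threshold) {
		printf("%llu blocks / %llu CPUs: stay disabled (share %llu < %llu)\n",
		       (unsigned long long)total, (unsigned long long)ncpus,
		       (unsigned long long)share, (unsigned long long)threshold);
		return;
	}
	printf("%llu blocks / %llu CPUs: CPU0 gets %llu, the others get %llu\n",
	       (unsigned long long)total, (unsigned long long)ncpus,
	       (unsigned long long)(share + resid), (unsigned long long)share);
}

int main(void)
{
	balance_example(10000, 4, 512);		/* plenty of space: re-enable, 2500 each */
	balance_example(1023, 4, 512);		/* near ENOSPC: share 255, stay disabled */
	return 0;
}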
2326 STATIC void
2327 xfs_icsb_balance_counter_locked(
2328  xfs_mount_t *mp,
2329  xfs_sb_field_t field,
2330  int min_per_cpu)
2331 {
2332  uint64_t count, resid;
2333  int weight = num_online_cpus();
2334  uint64_t min = (uint64_t)min_per_cpu;
2335 
2336  /* disable counter and sync counter */
2337  xfs_icsb_disable_counter(mp, field);
2338 
2339  /* update counters - first CPU gets residual */
2340  switch (field) {
2341  case XFS_SBS_ICOUNT:
2342  count = mp->m_sb.sb_icount;
2343  resid = do_div(count, weight);
2344  if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
2345  return;
2346  break;
2347  case XFS_SBS_IFREE:
2348  count = mp->m_sb.sb_ifree;
2349  resid = do_div(count, weight);
2350  if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
2351  return;
2352  break;
2353  case XFS_SBS_FDBLOCKS:
2354  count = mp->m_sb.sb_fdblocks;
2355  resid = do_div(count, weight);
2356  if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
2357  return;
2358  break;
2359  default:
2360  BUG();
2361  count = resid = 0; /* quiet, gcc */
2362  break;
2363  }
2364 
2365  xfs_icsb_enable_counter(mp, field, count, resid);
2366 }
2367 
2368 STATIC void
2369 xfs_icsb_balance_counter(
2370  xfs_mount_t *mp,
2371  xfs_sb_field_t fields,
2372  int min_per_cpu)
2373 {
2374  spin_lock(&mp->m_sb_lock);
2375  xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
2376  spin_unlock(&mp->m_sb_lock);
2377 }
2378 
2379 int
2380 xfs_icsb_modify_counters(
2381  xfs_mount_t *mp,
2382  xfs_sb_field_t field,
2383  int64_t delta,
2384  int rsvd)
2385 {
2386  xfs_icsb_cnts_t *icsbp;
2387  long long lcounter; /* long counter for 64 bit fields */
2388  int ret = 0;
2389 
2390  might_sleep();
2391 again:
2392  preempt_disable();
2393  icsbp = this_cpu_ptr(mp->m_sb_cnts);
2394 
2395  /*
2396  * if the counter is disabled, go to slow path
2397  */
2398  if (unlikely(xfs_icsb_counter_disabled(mp, field)))
2399  goto slow_path;
2400  xfs_icsb_lock_cntr(icsbp);
2401  if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
2402  xfs_icsb_unlock_cntr(icsbp);
2403  goto slow_path;
2404  }
2405 
2406  switch (field) {
2407  case XFS_SBS_ICOUNT:
2408  lcounter = icsbp->icsb_icount;
2409  lcounter += delta;
2410  if (unlikely(lcounter < 0))
2411  goto balance_counter;
2412  icsbp->icsb_icount = lcounter;
2413  break;
2414 
2415  case XFS_SBS_IFREE:
2416  lcounter = icsbp->icsb_ifree;
2417  lcounter += delta;
2418  if (unlikely(lcounter < 0))
2419  goto balance_counter;
2420  icsbp->icsb_ifree = lcounter;
2421  break;
2422 
2423  case XFS_SBS_FDBLOCKS:
2424  BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
2425 
2426  lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
2427  lcounter += delta;
2428  if (unlikely(lcounter < 0))
2429  goto balance_counter;
2430  icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
2431  break;
2432  default:
2433  BUG();
2434  break;
2435  }
2436  xfs_icsb_unlock_cntr(icsbp);
2437  preempt_enable();
2438  return 0;
2439 
2440 slow_path:
2441  preempt_enable();
2442 
2443  /*
2444  * serialise with a mutex so we don't burn lots of cpu on
2445  * the superblock lock. We still need to hold the superblock
2446  * lock, however, when we modify the global structures.
2447  */
2448  xfs_icsb_lock(mp);
2449 
2450  /*
2451  * Now running atomically.
2452  *
2453  * If the counter is enabled, someone has beaten us to rebalancing.
2454  * Drop the lock and try again in the fast path....
2455  */
2456  if (!(xfs_icsb_counter_disabled(mp, field))) {
2457  xfs_icsb_unlock(mp);
2458  goto again;
2459  }
2460 
2461  /*
2462  * The counter is currently disabled. Because we are
2463  * running atomically here, we know a rebalance cannot
2464  * be in progress. Hence we can go straight to operating
2465  * on the global superblock. We do not call xfs_mod_incore_sb()
2466  * here even though we need to get the m_sb_lock. Doing so
2467  * will cause us to re-enter this function and deadlock.
2468  * Hence we get the m_sb_lock ourselves and then call
2469  * xfs_mod_incore_sb_unlocked() as the unlocked path operates
2470  * directly on the global counters.
2471  */
2472  spin_lock(&mp->m_sb_lock);
2473  ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
2474  spin_unlock(&mp->m_sb_lock);
2475 
2476  /*
2477  * Now that we've modified the global superblock, we
2478  * may be able to re-enable the distributed counters
2479  * (e.g. lots of space just got freed). After that
2480  * we are done.
2481  */
2482  if (ret != ENOSPC)
2483  xfs_icsb_balance_counter(mp, field, 0);
2484  xfs_icsb_unlock(mp);
2485  return ret;
2486 
2487 balance_counter:
2488  xfs_icsb_unlock_cntr(icsbp);
2489  preempt_enable();
2490 
2491  /*
2492  * We may have multiple threads here if multiple per-cpu
2493  * counters run dry at the same time. This will mean we can
2494  * do more balances than strictly necessary but it is not
2495  * the common slowpath case.
2496  */
2497  xfs_icsb_lock(mp);
2498 
2499  /*
2500  * running atomically.
2501  *
2502  * This will leave the counter in the correct state for future
2503  * accesses. After the rebalance, we simply try again and our retry
2504  * will either succeed through the fast path or slow path without
2505  * another balance operation being required.
2506  */
2507  xfs_icsb_balance_counter(mp, field, delta);
2508  xfs_icsb_unlock(mp);
2509  goto again;
2510 }
2511 
2512 #endif