Linux Kernel 3.7.1
xfs_log_recover.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_trans.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_mount.h"
28 #include "xfs_error.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_alloc_btree.h"
31 #include "xfs_ialloc_btree.h"
32 #include "xfs_dinode.h"
33 #include "xfs_inode.h"
34 #include "xfs_inode_item.h"
35 #include "xfs_alloc.h"
36 #include "xfs_ialloc.h"
37 #include "xfs_log_priv.h"
38 #include "xfs_buf_item.h"
39 #include "xfs_log_recover.h"
40 #include "xfs_extfree_item.h"
41 #include "xfs_trans_priv.h"
42 #include "xfs_quota.h"
43 #include "xfs_utils.h"
44 #include "xfs_trace.h"
45 
46 STATIC int
47 xlog_find_zeroed(
48  struct xlog *,
49  xfs_daddr_t *);
50 STATIC int
51 xlog_clear_stale_blocks(
52  struct xlog *,
53  xfs_lsn_t);
54 #if defined(DEBUG)
55 STATIC void
56 xlog_recover_check_summary(
57  struct xlog *);
58 #else
59 #define xlog_recover_check_summary(log)
60 #endif
61 
62 /*
63  * This structure is used during recovery to record the buf log items which
64  * have been canceled and should not be replayed.
65  */
66 struct xfs_buf_cancel {
67  xfs_daddr_t bc_blkno;
68  uint bc_len;
69  int bc_refcount;
70  struct list_head bc_list;
71 };
72 
73 /*
74  * Sector aligned buffer routines for buffer create/read/write/access
75  */
76 
77 /*
78  * Verify the given count of basic blocks is valid number of blocks
79  * to specify for an operation involving the given XFS log buffer.
80  * Returns nonzero if the count is valid, 0 otherwise.
81  */
82 
83 static inline int
84 xlog_buf_bbcount_valid(
85  struct xlog *log,
86  int bbcount)
87 {
88  return bbcount > 0 && bbcount <= log->l_logBBsize;
89 }
90 
91 /*
92  * Allocate a buffer to hold log data. The buffer needs to be able
93  * to map to a range of nbblks basic blocks at any valid (basic
94  * block) offset within the log.
95  */
96 STATIC xfs_buf_t *
97 xlog_get_bp(
98  struct xlog *log,
99  int nbblks)
100 {
101  struct xfs_buf *bp;
102 
103  if (!xlog_buf_bbcount_valid(log, nbblks)) {
104  xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
105  nbblks);
106  XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
107  return NULL;
108  }
109 
110  /*
111  * We do log I/O in units of log sectors (a power-of-2
112  * multiple of the basic block size), so we round up the
113  * requested size to accommodate the basic blocks required
114  * for complete log sectors.
115  *
116  * In addition, the buffer may be used for a non-sector-
117  * aligned block offset, in which case an I/O of the
118  * requested size could extend beyond the end of the
119  * buffer. If the requested size is only 1 basic block it
120  * will never straddle a sector boundary, so this won't be
121  * an issue. Nor will this be a problem if the log I/O is
122  * done in basic blocks (sector size 1). But otherwise we
123  * extend the buffer by one extra log sector to ensure
124  * there's space to accommodate this possibility.
125  */
126  if (nbblks > 1 && log->l_sectBBsize > 1)
127  nbblks += log->l_sectBBsize;
128  nbblks = round_up(nbblks, log->l_sectBBsize);
129 
130  bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
131  if (bp)
132  xfs_buf_unlock(bp);
133  return bp;
134 }
135 
136 STATIC void
137 xlog_put_bp(
138  xfs_buf_t *bp)
139 {
140  xfs_buf_free(bp);
141 }
142 
143 /*
144  * Return the address of the start of the given block number's data
145  * in a log buffer. The buffer covers a log sector-aligned region.
146  */
147 STATIC xfs_caddr_t
148 xlog_align(
149  struct xlog *log,
150  xfs_daddr_t blk_no,
151  int nbblks,
152  struct xfs_buf *bp)
153 {
154  xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
155 
156  ASSERT(offset + nbblks <= bp->b_length);
157  return bp->b_addr + BBTOB(offset);
158 }
159 
160 
161 /*
162  * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
163  */
164 STATIC int
165 xlog_bread_noalign(
166  struct xlog *log,
167  xfs_daddr_t blk_no,
168  int nbblks,
169  struct xfs_buf *bp)
170 {
171  int error;
172 
173  if (!xlog_buf_bbcount_valid(log, nbblks)) {
174  xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
175  nbblks);
176  XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
177  return EFSCORRUPTED;
178  }
179 
180  blk_no = round_down(blk_no, log->l_sectBBsize);
181  nbblks = round_up(nbblks, log->l_sectBBsize);
182 
183  ASSERT(nbblks > 0);
184  ASSERT(nbblks <= bp->b_length);
185 
186  XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
187  XFS_BUF_READ(bp);
188  bp->b_io_length = nbblks;
189  bp->b_error = 0;
190 
191  xfsbdstrat(log->l_mp, bp);
192  error = xfs_buf_iowait(bp);
193  if (error)
194  xfs_buf_ioerror_alert(bp, __func__);
195  return error;
196 }
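
The buffer sizing in xlog_get_bp() and the alignment in xlog_bread_noalign() above are both power-of-two rounding against the log sector size. A minimal userspace sketch of the same arithmetic (hypothetical helper names; round_down_pow2()/round_up_pow2() stand in for the kernel's round_down()/round_up() and assume a power-of-two sector size):

#include <assert.h>
#include <stdio.h>

static unsigned round_down_pow2(unsigned x, unsigned sz) { return x & ~(sz - 1); }
static unsigned round_up_pow2(unsigned x, unsigned sz)   { return (x + sz - 1) & ~(sz - 1); }

int main(void)
{
        unsigned sectBB = 8;            /* log sector size in basic blocks */
        unsigned blk_no = 21, nbblks = 3;

        /* Align the read the way xlog_bread_noalign() does. */
        unsigned start = round_down_pow2(blk_no, sectBB);   /* 16 */
        unsigned count = round_up_pow2(nbblks, sectBB);     /* 8  */

        /* Size the buffer the way xlog_get_bp() does: a multi-block read at a
         * non-aligned offset may straddle one extra sector, so pad first. */
        unsigned bufblks = round_up_pow2(nbblks + sectBB, sectBB);  /* 16 */

        assert(start <= blk_no && blk_no + nbblks <= start + count);
        printf("read [%u, %u) into a %u-block buffer\n", start, start + count, bufblks);
        return 0;
}
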
197 
198 STATIC int
199 xlog_bread(
200  struct xlog *log,
201  xfs_daddr_t blk_no,
202  int nbblks,
203  struct xfs_buf *bp,
204  xfs_caddr_t *offset)
205 {
206  int error;
207 
208  error = xlog_bread_noalign(log, blk_no, nbblks, bp);
209  if (error)
210  return error;
211 
212  *offset = xlog_align(log, blk_no, nbblks, bp);
213  return 0;
214 }
215 
216 /*
217  * Read at an offset into the buffer. Returns with the buffer in its original
218  * state regardless of the result of the read.
219  */
220 STATIC int
221 xlog_bread_offset(
222  struct xlog *log,
223  xfs_daddr_t blk_no, /* block to read from */
224  int nbblks, /* blocks to read */
225  struct xfs_buf *bp,
226  xfs_caddr_t offset)
227 {
228  xfs_caddr_t orig_offset = bp->b_addr;
229  int orig_len = BBTOB(bp->b_length);
230  int error, error2;
231 
232  error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
233  if (error)
234  return error;
235 
236  error = xlog_bread_noalign(log, blk_no, nbblks, bp);
237 
238  /* must reset buffer pointer even on error */
239  error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
240  if (error)
241  return error;
242  return error2;
243 }
244 
245 /*
246  * Write out the buffer at the given block for the given number of blocks.
247  * The buffer is kept locked across the write and is returned locked.
248  * This can only be used for synchronous log writes.
249  */
250 STATIC int
251 xlog_bwrite(
252  struct xlog *log,
253  xfs_daddr_t blk_no,
254  int nbblks,
255  struct xfs_buf *bp)
256 {
257  int error;
258 
259  if (!xlog_buf_bbcount_valid(log, nbblks)) {
260  xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
261  nbblks);
262  XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
263  return EFSCORRUPTED;
264  }
265 
266  blk_no = round_down(blk_no, log->l_sectBBsize);
267  nbblks = round_up(nbblks, log->l_sectBBsize);
268 
269  ASSERT(nbblks > 0);
270  ASSERT(nbblks <= bp->b_length);
271 
272  XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
273  XFS_BUF_ZEROFLAGS(bp);
274  xfs_buf_hold(bp);
275  xfs_buf_lock(bp);
276  bp->b_io_length = nbblks;
277  bp->b_error = 0;
278 
279  error = xfs_bwrite(bp);
280  if (error)
281  xfs_buf_ioerror_alert(bp, __func__);
282  xfs_buf_relse(bp);
283  return error;
284 }
285 
286 #ifdef DEBUG
287 /*
288  * dump debug superblock and log record information
289  */
290 STATIC void
291 xlog_header_check_dump(
292  xfs_mount_t *mp,
293  xlog_rec_header_t *head)
294 {
295  xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
296  __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
297  xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
298  &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
299 }
300 #else
301 #define xlog_header_check_dump(mp, head)
302 #endif
303 
304 /*
305  * check log record header for recovery
306  */
307 STATIC int
308 xlog_header_check_recover(
309  xfs_mount_t *mp,
310  xlog_rec_header_t *head)
311 {
313 
314  /*
315  * IRIX doesn't write the h_fmt field and leaves it zeroed
316  * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
317  * a dirty log created in IRIX.
318  */
319  if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
320  xfs_warn(mp,
321  "dirty log written in incompatible format - can't recover");
322  xlog_header_check_dump(mp, head);
323  XFS_ERROR_REPORT("xlog_header_check_recover(1)",
324  XFS_ERRLEVEL_HIGH, mp);
325  return XFS_ERROR(EFSCORRUPTED);
326  } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
327  xfs_warn(mp,
328  "dirty log entry has mismatched uuid - can't recover");
329  xlog_header_check_dump(mp, head);
330  XFS_ERROR_REPORT("xlog_header_check_recover(2)",
331  XFS_ERRLEVEL_HIGH, mp);
332  return XFS_ERROR(EFSCORRUPTED);
333  }
334  return 0;
335 }
336 
337 /*
338  * read the head block of the log and check the header
339  */
340 STATIC int
341 xlog_header_check_mount(
342  xfs_mount_t *mp,
343  xlog_rec_header_t *head)
344 {
346 
347  if (uuid_is_nil(&head->h_fs_uuid)) {
348  /*
349  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
350  * h_fs_uuid is nil, we assume this log was last mounted
351  * by IRIX and continue.
352  */
353  xfs_warn(mp, "nil uuid in log - IRIX style log");
354  } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
355  xfs_warn(mp, "log has mismatched uuid - can't recover");
356  xlog_header_check_dump(mp, head);
357  XFS_ERROR_REPORT("xlog_header_check_mount",
358  XFS_ERRLEVEL_HIGH, mp);
359  return XFS_ERROR(EFSCORRUPTED);
360  }
361  return 0;
362 }
363 
364 STATIC void
365 xlog_recover_iodone(
366  struct xfs_buf *bp)
367 {
368  if (bp->b_error) {
369  /*
370  * We're not going to bother about retrying
371  * this during recovery. One strike!
372  */
373  xfs_buf_ioerror_alert(bp, __func__);
374  xfs_force_shutdown(bp->b_target->bt_mount,
375  SHUTDOWN_META_IO_ERROR);
376  }
377  bp->b_iodone = NULL;
378  xfs_buf_ioend(bp, 0);
379 }
380 
381 /*
382  * This routine finds (to an approximation) the first block in the physical
383  * log which contains the given cycle. It uses a binary search algorithm.
384  * Note that the algorithm can not be perfect because the disk will not
385  * necessarily be perfect.
386  */
387 STATIC int
388 xlog_find_cycle_start(
389  struct xlog *log,
390  struct xfs_buf *bp,
391  xfs_daddr_t first_blk,
392  xfs_daddr_t *last_blk,
393  uint cycle)
394 {
395  xfs_caddr_t offset;
396  xfs_daddr_t mid_blk;
397  xfs_daddr_t end_blk;
398  uint mid_cycle;
399  int error;
400 
401  end_blk = *last_blk;
402  mid_blk = BLK_AVG(first_blk, end_blk);
403  while (mid_blk != first_blk && mid_blk != end_blk) {
404  error = xlog_bread(log, mid_blk, 1, bp, &offset);
405  if (error)
406  return error;
407  mid_cycle = xlog_get_cycle(offset);
408  if (mid_cycle == cycle)
409  end_blk = mid_blk; /* last_half_cycle == mid_cycle */
410  else
411  first_blk = mid_blk; /* first_half_cycle == mid_cycle */
412  mid_blk = BLK_AVG(first_blk, end_blk);
413  }
414  ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
415  (mid_blk == end_blk && mid_blk-1 == first_blk));
416 
417  *last_blk = end_blk;
418 
419  return 0;
420 }
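
xlog_find_cycle_start() is a binary search over a monotone boundary: blocks before the first write of the given cycle still carry a different cycle number, blocks at or after it carry that cycle. A standalone sketch of the same loop over an in-memory array (illustrative names; the real code reads each probe from disk with xlog_bread()):

#include <stdio.h>

/* Find the first index in [first, last] whose cycle equals 'cycle',
 * assuming cycles[first] != cycle and cycles[last] == cycle, which is
 * the invariant xlog_find_cycle_start() maintains. */
static int find_cycle_start(const unsigned *cycles, int first, int last, unsigned cycle)
{
        int mid = (first + last) / 2;           /* BLK_AVG() */

        while (mid != first && mid != last) {
                if (cycles[mid] == cycle)
                        last = mid;             /* first occurrence at or before mid */
                else
                        first = mid;            /* first occurrence after mid */
                mid = (first + last) / 2;
        }
        return last;
}

int main(void)
{
        unsigned cycles[] = { 7, 7, 7, 7, 6, 6, 6, 6 };  /* boundary at index 4 */

        printf("first block of cycle 6: %d\n", find_cycle_start(cycles, 0, 7, 6));
        return 0;
}
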
421 
422 /*
423  * Check that a range of blocks does not contain stop_on_cycle_no.
424  * Fill in *new_blk with the block offset where such a block is
425  * found, or with -1 (an invalid block number) if there is no such
426  * block in the range. The scan needs to occur from front to back
427  * and the pointer into the region must be updated since a later
428  * routine will need to perform another test.
429  */
430 STATIC int
431 xlog_find_verify_cycle(
432  struct xlog *log,
433  xfs_daddr_t start_blk,
434  int nbblks,
435  uint stop_on_cycle_no,
436  xfs_daddr_t *new_blk)
437 {
438  xfs_daddr_t i, j;
439  uint cycle;
440  xfs_buf_t *bp;
441  xfs_daddr_t bufblks;
442  xfs_caddr_t buf = NULL;
443  int error = 0;
444 
445  /*
446  * Greedily allocate a buffer big enough to handle the full
447  * range of basic blocks we'll be examining. If that fails,
448  * try a smaller size. We need to be able to read at least
449  * a log sector, or we're out of luck.
450  */
451  bufblks = 1 << ffs(nbblks);
452  while (bufblks > log->l_logBBsize)
453  bufblks >>= 1;
454  while (!(bp = xlog_get_bp(log, bufblks))) {
455  bufblks >>= 1;
456  if (bufblks < log->l_sectBBsize)
457  return ENOMEM;
458  }
459 
460  for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
461  int bcount;
462 
463  bcount = min(bufblks, (start_blk + nbblks - i));
464 
465  error = xlog_bread(log, i, bcount, bp, &buf);
466  if (error)
467  goto out;
468 
469  for (j = 0; j < bcount; j++) {
470  cycle = xlog_get_cycle(buf);
471  if (cycle == stop_on_cycle_no) {
472  *new_blk = i+j;
473  goto out;
474  }
475 
476  buf += BBSIZE;
477  }
478  }
479 
480  *new_blk = -1;
481 
482 out:
483  xlog_put_bp(bp);
484  return error;
485 }
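
The scan above reads the range in chunks no larger than the buffer it managed to allocate, clamping the final chunk. A sketch of just the chunking arithmetic (illustrative numbers; the real code also halves bufblks until xlog_get_bp() succeeds):

#include <stdio.h>

int main(void)
{
        int start_blk = 100, nbblks = 37, bufblks = 16;
        int i;

        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
                int bcount = bufblks;

                if (bcount > start_blk + nbblks - i)
                        bcount = start_blk + nbblks - i;   /* min(), as above */
                printf("read %d blocks at block %d\n", bcount, i);
        }
        return 0;       /* prints 16@100, 16@116, 5@132 */
}
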
486 
487 /*
488  * Potentially backup over partial log record write.
489  *
490  * In the typical case, last_blk is the number of the block directly after
491  * a good log record. Therefore, we subtract one to get the block number
492  * of the last block in the given buffer. extra_bblks contains the number
493  * of blocks we would have read on a previous read. This happens when the
494  * last log record is split over the end of the physical log.
495  *
496  * extra_bblks is the number of blocks potentially verified on a previous
497  * call to this routine.
498  */
499 STATIC int
500 xlog_find_verify_log_record(
501  struct xlog *log,
502  xfs_daddr_t start_blk,
503  xfs_daddr_t *last_blk,
504  int extra_bblks)
505 {
506  xfs_daddr_t i;
507  xfs_buf_t *bp;
508  xfs_caddr_t offset = NULL;
509  xlog_rec_header_t *head = NULL;
510  int error = 0;
511  int smallmem = 0;
512  int num_blks = *last_blk - start_blk;
513  int xhdrs;
514 
515  ASSERT(start_blk != 0 || *last_blk != start_blk);
516 
517  if (!(bp = xlog_get_bp(log, num_blks))) {
518  if (!(bp = xlog_get_bp(log, 1)))
519  return ENOMEM;
520  smallmem = 1;
521  } else {
522  error = xlog_bread(log, start_blk, num_blks, bp, &offset);
523  if (error)
524  goto out;
525  offset += ((num_blks - 1) << BBSHIFT);
526  }
527 
528  for (i = (*last_blk) - 1; i >= 0; i--) {
529  if (i < start_blk) {
530  /* valid log record not found */
531  xfs_warn(log->l_mp,
532  "Log inconsistent (didn't find previous header)");
533  ASSERT(0);
534  error = XFS_ERROR(EIO);
535  goto out;
536  }
537 
538  if (smallmem) {
539  error = xlog_bread(log, i, 1, bp, &offset);
540  if (error)
541  goto out;
542  }
543 
544  head = (xlog_rec_header_t *)offset;
545 
546  if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
547  break;
548 
549  if (!smallmem)
550  offset -= BBSIZE;
551  }
552 
553  /*
554  * We hit the beginning of the physical log & still no header. Return
555  * to caller. If caller can handle a return of -1, then this routine
556  * will be called again for the end of the physical log.
557  */
558  if (i == -1) {
559  error = -1;
560  goto out;
561  }
562 
563  /*
564  * We have the final block of the good log (the first block
565  * of the log record _before_ the head). So we check the uuid.
566  */
567  if ((error = xlog_header_check_mount(log->l_mp, head)))
568  goto out;
569 
570  /*
571  * We may have found a log record header before we expected one.
572  * last_blk will be the 1st block # with a given cycle #. We may end
573  * up reading an entire log record. In this case, we don't want to
574  * reset last_blk. Only when last_blk points in the middle of a log
575  * record do we update last_blk.
576  */
577  if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
578  uint h_size = be32_to_cpu(head->h_size);
579 
580  xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
581  if (h_size % XLOG_HEADER_CYCLE_SIZE)
582  xhdrs++;
583  } else {
584  xhdrs = 1;
585  }
586 
587  if (*last_blk - i + extra_bblks !=
588  BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
589  *last_blk = i;
590 
591 out:
592  xlog_put_bp(bp);
593  return error;
594 }
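
The xhdrs calculation above is a round-up division: a version-2 log record whose header describes more than XLOG_HEADER_CYCLE_SIZE bytes of data needs one extra header block per additional chunk. A sketch, assuming the usual 32 KB value for that constant:

#include <stdio.h>

#define HDR_CYCLE_SIZE  (32 * 1024)     /* assumed XLOG_HEADER_CYCLE_SIZE */

/* Round-up division, written with an explicit remainder test as above. */
static int num_header_blocks(unsigned h_size)
{
        int xhdrs = h_size / HDR_CYCLE_SIZE;

        if (h_size % HDR_CYCLE_SIZE)
                xhdrs++;
        return xhdrs;
}

int main(void)
{
        printf("%d %d %d\n",
               num_header_blocks(32 * 1024),        /* 1 */
               num_header_blocks(64 * 1024),        /* 2 */
               num_header_blocks(96 * 1024 + 1));   /* 4 */
        return 0;
}
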
595 
596 /*
597  * Head is defined to be the point of the log where the next log write
598  * could go. This means that incomplete LR writes at the end are
599  * eliminated when calculating the head. We aren't guaranteed that previous
600  * LR have complete transactions. We only know that a cycle number of
601  * current cycle number -1 won't be present in the log if we start writing
602  * from our current block number.
603  *
604  * last_blk contains the block number of the first block with a given
605  * cycle number.
606  *
607  * Return: zero if normal, non-zero if error.
608  */
609 STATIC int
610 xlog_find_head(
611  struct xlog *log,
612  xfs_daddr_t *return_head_blk)
613 {
614  xfs_buf_t *bp;
615  xfs_caddr_t offset;
616  xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
617  int num_scan_bblks;
618  uint first_half_cycle, last_half_cycle;
619  uint stop_on_cycle;
620  int error, log_bbnum = log->l_logBBsize;
621 
622  /* Is the end of the log device zeroed? */
623  if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
624  *return_head_blk = first_blk;
625 
626  /* Is the whole lot zeroed? */
627  if (!first_blk) {
628  /* Linux XFS shouldn't generate totally zeroed logs -
629  * mkfs etc write a dummy unmount record to a fresh
630  * log so we can store the uuid in there
631  */
632  xfs_warn(log->l_mp, "totally zeroed log");
633  }
634 
635  return 0;
636  } else if (error) {
637  xfs_warn(log->l_mp, "empty log check failed");
638  return error;
639  }
640 
641  first_blk = 0; /* get cycle # of 1st block */
642  bp = xlog_get_bp(log, 1);
643  if (!bp)
644  return ENOMEM;
645 
646  error = xlog_bread(log, 0, 1, bp, &offset);
647  if (error)
648  goto bp_err;
649 
650  first_half_cycle = xlog_get_cycle(offset);
651 
652  last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
653  error = xlog_bread(log, last_blk, 1, bp, &offset);
654  if (error)
655  goto bp_err;
656 
657  last_half_cycle = xlog_get_cycle(offset);
658  ASSERT(last_half_cycle != 0);
659 
660  /*
661  * If the 1st half cycle number is equal to the last half cycle number,
662  * then the entire log is stamped with the same cycle number. In this
663  * case, head_blk can't be set to zero (which makes sense). The below
664  * math doesn't work out properly with head_blk equal to zero. Instead,
665  * we set it to log_bbnum which is an invalid block number, but this
666  * value makes the math correct. If head_blk doesn't change through
667  * all the tests below, *head_blk is set to zero at the very end rather
668  * than log_bbnum. In a sense, log_bbnum and zero are the same block
669  * in a circular file.
670  */
671  if (first_half_cycle == last_half_cycle) {
672  /*
673  * In this case we believe that the entire log should have
674  * cycle number last_half_cycle. We need to scan backwards
675  * from the end verifying that there are no holes still
676  * containing last_half_cycle - 1. If we find such a hole,
677  * then the start of that hole will be the new head. The
678  * simple case looks like
679  * x | x ... | x - 1 | x
680  * Another case that fits this picture would be
681  * x | x + 1 | x ... | x
682  * In this case the head really is somewhere at the end of the
683  * log, as one of the latest writes at the beginning was
684  * incomplete.
685  * One more case is
686  * x | x + 1 | x ... | x - 1 | x
687  * This is really the combination of the above two cases, and
688  * the head has to end up at the start of the x-1 hole at the
689  * end of the log.
690  *
691  * In the 256k log case, we will read from the beginning to the
692  * end of the log and search for cycle numbers equal to x-1.
693  * We don't worry about the x+1 blocks that we encounter,
694  * because we know that they cannot be the head since the log
695  * started with x.
696  */
697  head_blk = log_bbnum;
698  stop_on_cycle = last_half_cycle - 1;
699  } else {
700  /*
701  * In this case we want to find the first block with cycle
702  * number matching last_half_cycle. We expect the log to be
703  * some variation on
704  * x + 1 ... | x ... | x
705  * The first block with cycle number x (last_half_cycle) will
706  * be where the new head belongs. First we do a binary search
707  * for the first occurrence of last_half_cycle. The binary
708  * search may not be totally accurate, so then we scan back
709  * from there looking for occurrences of last_half_cycle before
710  * us. If that backwards scan wraps around the beginning of
711  * the log, then we look for occurrences of last_half_cycle - 1
712  * at the end of the log. The cases we're looking for look
713  * like
714  * v binary search stopped here
715  * x + 1 ... | x | x + 1 | x ... | x
716  * ^ but we want to locate this spot
717  * or
718  * <---------> less than scan distance
719  * x + 1 ... | x ... | x - 1 | x
720  * ^ we want to locate this spot
721  */
722  stop_on_cycle = last_half_cycle;
723  if ((error = xlog_find_cycle_start(log, bp, first_blk,
724  &head_blk, last_half_cycle)))
725  goto bp_err;
726  }
727 
728  /*
729  * Now validate the answer. Scan back some number of maximum possible
730  * blocks and make sure each one has the expected cycle number. The
731  * maximum is determined by the total possible amount of buffering
732  * in the in-core log. The following number can be made tighter if
733  * we actually look at the block size of the filesystem.
734  */
735  num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
736  if (head_blk >= num_scan_bblks) {
737  /*
738  * We are guaranteed that the entire check can be performed
739  * in one buffer.
740  */
741  start_blk = head_blk - num_scan_bblks;
742  if ((error = xlog_find_verify_cycle(log,
743  start_blk, num_scan_bblks,
744  stop_on_cycle, &new_blk)))
745  goto bp_err;
746  if (new_blk != -1)
747  head_blk = new_blk;
748  } else { /* need to read 2 parts of log */
749  /*
750  * We are going to scan backwards in the log in two parts.
751  * First we scan the physical end of the log. In this part
752  * of the log, we are looking for blocks with cycle number
753  * last_half_cycle - 1.
754  * If we find one, then we know that the log starts there, as
755  * we've found a hole that didn't get written in going around
756  * the end of the physical log. The simple case for this is
757  * x + 1 ... | x ... | x - 1 | x
758  * <---------> less than scan distance
759  * If all of the blocks at the end of the log have cycle number
760  * last_half_cycle, then we check the blocks at the start of
761  * the log looking for occurrences of last_half_cycle. If we
762  * find one, then our current estimate for the location of the
763  * first occurrence of last_half_cycle is wrong and we move
764  * back to the hole we've found. This case looks like
765  * x + 1 ... | x | x + 1 | x ...
766  * ^ binary search stopped here
767  * Another case we need to handle that only occurs in 256k
768  * logs is
769  * x + 1 ... | x ... | x+1 | x ...
770  * ^ binary search stops here
771  * In a 256k log, the scan at the end of the log will see the
772  * x + 1 blocks. We need to skip past those since that is
773  * certainly not the head of the log. By searching for
774  * last_half_cycle-1 we accomplish that.
775  */
776  ASSERT(head_blk <= INT_MAX &&
777  (xfs_daddr_t) num_scan_bblks >= head_blk);
778  start_blk = log_bbnum - (num_scan_bblks - head_blk);
779  if ((error = xlog_find_verify_cycle(log, start_blk,
780  num_scan_bblks - (int)head_blk,
781  (stop_on_cycle - 1), &new_blk)))
782  goto bp_err;
783  if (new_blk != -1) {
784  head_blk = new_blk;
785  goto validate_head;
786  }
787 
788  /*
789  * Scan beginning of log now. The last part of the physical
790  * log is good. This scan needs to verify that it doesn't find
791  * the last_half_cycle.
792  */
793  start_blk = 0;
794  ASSERT(head_blk <= INT_MAX);
795  if ((error = xlog_find_verify_cycle(log,
796  start_blk, (int)head_blk,
797  stop_on_cycle, &new_blk)))
798  goto bp_err;
799  if (new_blk != -1)
800  head_blk = new_blk;
801  }
802 
803 validate_head:
804  /*
805  * Now we need to make sure head_blk is not pointing to a block in
806  * the middle of a log record.
807  */
808  num_scan_bblks = XLOG_REC_SHIFT(log);
809  if (head_blk >= num_scan_bblks) {
810  start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
811 
812  /* start ptr at last block ptr before head_blk */
813  if ((error = xlog_find_verify_log_record(log, start_blk,
814  &head_blk, 0)) == -1) {
815  error = XFS_ERROR(EIO);
816  goto bp_err;
817  } else if (error)
818  goto bp_err;
819  } else {
820  start_blk = 0;
821  ASSERT(head_blk <= INT_MAX);
822  if ((error = xlog_find_verify_log_record(log, start_blk,
823  &head_blk, 0)) == -1) {
824  /* We hit the beginning of the log during our search */
825  start_blk = log_bbnum - (num_scan_bblks - head_blk);
826  new_blk = log_bbnum;
827  ASSERT(start_blk <= INT_MAX &&
828  (xfs_daddr_t) log_bbnum-start_blk >= 0);
829  ASSERT(head_blk <= INT_MAX);
830  if ((error = xlog_find_verify_log_record(log,
831  start_blk, &new_blk,
832  (int)head_blk)) == -1) {
833  error = XFS_ERROR(EIO);
834  goto bp_err;
835  } else if (error)
836  goto bp_err;
837  if (new_blk != log_bbnum)
838  head_blk = new_blk;
839  } else if (error)
840  goto bp_err;
841  }
842 
843  xlog_put_bp(bp);
844  if (head_blk == log_bbnum)
845  *return_head_blk = 0;
846  else
847  *return_head_blk = head_blk;
848  /*
849  * When returning here, we have a good block number. Bad block
850  * means that during a previous crash, we didn't have a clean break
851  * from cycle number N to cycle number N-1. In this case, we need
852  * to find the first block with cycle number N-1.
853  */
854  return 0;
855 
856  bp_err:
857  xlog_put_bp(bp);
858 
859  if (error)
860  xfs_warn(log->l_mp, "failed to find log head");
861  return error;
862 }
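
When the verification window reaches back past block 0, the backwards scan is split across the physical end of the log, which is what the two xlog_find_verify_cycle() calls above do. A small sketch of the split arithmetic with made-up numbers:

#include <stdio.h>

int main(void)
{
        int log_bbnum = 1024;           /* physical log size in basic blocks */
        int num_scan_bblks = 128;       /* how far back we must verify */
        int head_blk = 40;              /* candidate head near the start */

        if (head_blk >= num_scan_bblks) {
                printf("scan [%d, %d)\n", head_blk - num_scan_bblks, head_blk);
        } else {
                /* Window wraps: scan the tail of the log, then the start. */
                int start_blk = log_bbnum - (num_scan_bblks - head_blk);

                printf("scan [%d, %d) then [0, %d)\n",
                       start_blk, log_bbnum, head_blk);   /* [936, 1024) then [0, 40) */
        }
        return 0;
}
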
863 
864 /*
865  * Find the sync block number or the tail of the log.
866  *
867  * This will be the block number of the last record to have its
868  * associated buffers synced to disk. Every log record header has
869  * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
870  * to get a sync block number. The only concern is to figure out which
871  * log record header to believe.
872  *
873  * The following algorithm uses the log record header with the largest
874  * lsn. The entire log record does not need to be valid. We only care
875  * that the header is valid.
876  *
877  * We could speed up search by using current head_blk buffer, but it is not
878  * available.
879  */
880 STATIC int
881 xlog_find_tail(
882  struct xlog *log,
883  xfs_daddr_t *head_blk,
884  xfs_daddr_t *tail_blk)
885 {
886  xlog_rec_header_t *rhead;
887  xlog_op_header_t *op_head;
888  xfs_caddr_t offset = NULL;
889  xfs_buf_t *bp;
890  int error, i, found;
891  xfs_daddr_t umount_data_blk;
892  xfs_daddr_t after_umount_blk;
893  xfs_lsn_t tail_lsn;
894  int hblks;
895 
896  found = 0;
897 
898  /*
899  * Find previous log record
900  */
901  if ((error = xlog_find_head(log, head_blk)))
902  return error;
903 
904  bp = xlog_get_bp(log, 1);
905  if (!bp)
906  return ENOMEM;
907  if (*head_blk == 0) { /* special case */
908  error = xlog_bread(log, 0, 1, bp, &offset);
909  if (error)
910  goto done;
911 
912  if (xlog_get_cycle(offset) == 0) {
913  *tail_blk = 0;
914  /* leave all other log inited values alone */
915  goto done;
916  }
917  }
918 
919  /*
920  * Search backwards looking for log record header block
921  */
922  ASSERT(*head_blk < INT_MAX);
923  for (i = (int)(*head_blk) - 1; i >= 0; i--) {
924  error = xlog_bread(log, i, 1, bp, &offset);
925  if (error)
926  goto done;
927 
928  if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
929  found = 1;
930  break;
931  }
932  }
933  /*
934  * If we haven't found the log record header block, start looking
935  * again from the end of the physical log. XXXmiken: There should be
936  * a check here to make sure we didn't search more than N blocks in
937  * the previous code.
938  */
939  if (!found) {
940  for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
941  error = xlog_bread(log, i, 1, bp, &offset);
942  if (error)
943  goto done;
944 
945  if (*(__be32 *)offset ==
946  cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
947  found = 2;
948  break;
949  }
950  }
951  }
952  if (!found) {
953  xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
954  ASSERT(0);
955  return XFS_ERROR(EIO);
956  }
957 
958  /* find blk_no of tail of log */
959  rhead = (xlog_rec_header_t *)offset;
960  *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
961 
962  /*
963  * Reset log values according to the state of the log when we
964  * crashed. In the case where head_blk == 0, we bump curr_cycle
965  * one because the next write starts a new cycle rather than
966  * continuing the cycle of the last good log record. At this
967  * point we have guaranteed that all partial log records have been
968  * accounted for. Therefore, we know that the last good log record
969  * written was complete and ended exactly on the end boundary
970  * of the physical log.
971  */
972  log->l_prev_block = i;
973  log->l_curr_block = (int)*head_blk;
974  log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
975  if (found == 2)
976  log->l_curr_cycle++;
977  atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
978  atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
979  xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
980  BBTOB(log->l_curr_block));
981  xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
982  BBTOB(log->l_curr_block));
983 
984  /*
985  * Look for unmount record. If we find it, then we know there
986  * was a clean unmount. Since 'i' could be the last block in
987  * the physical log, we convert to a log block before comparing
988  * to the head_blk.
989  *
990  * Save the current tail lsn to use to pass to
991  * xlog_clear_stale_blocks() below. We won't want to clear the
992  * unmount record if there is one, so we pass the lsn of the
993  * unmount record rather than the block after it.
994  */
995  if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
996  int h_size = be32_to_cpu(rhead->h_size);
997  int h_version = be32_to_cpu(rhead->h_version);
998 
999  if ((h_version & XLOG_VERSION_2) &&
1000  (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1001  hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1002  if (h_size % XLOG_HEADER_CYCLE_SIZE)
1003  hblks++;
1004  } else {
1005  hblks = 1;
1006  }
1007  } else {
1008  hblks = 1;
1009  }
1010  after_umount_blk = (i + hblks + (int)
1011  BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1012  tail_lsn = atomic64_read(&log->l_tail_lsn);
1013  if (*head_blk == after_umount_blk &&
1014  be32_to_cpu(rhead->h_num_logops) == 1) {
1015  umount_data_blk = (i + hblks) % log->l_logBBsize;
1016  error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1017  if (error)
1018  goto done;
1019 
1020  op_head = (xlog_op_header_t *)offset;
1021  if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1022  /*
1023  * Set tail and last sync so that newly written
1024  * log records will point recovery to after the
1025  * current unmount record.
1026  */
1027  xlog_assign_atomic_lsn(&log->l_tail_lsn,
1028  log->l_curr_cycle, after_umount_blk);
1029  xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1030  log->l_curr_cycle, after_umount_blk);
1031  *tail_blk = after_umount_blk;
1032 
1033  /*
1034  * Note that the unmount was clean. If the unmount
1035  * was not clean, we need to know this to rebuild the
1036  * superblock counters from the perag headers if we
1037  * have a filesystem using non-persistent counters.
1038  */
1039  log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1040  }
1041  }
1042 
1043  /*
1044  * Make sure that there are no blocks in front of the head
1045  * with the same cycle number as the head. This can happen
1046  * because we allow multiple outstanding log writes concurrently,
1047  * and the later writes might make it out before earlier ones.
1048  *
1049  * We use the lsn from before modifying it so that we'll never
1050  * overwrite the unmount record after a clean unmount.
1051  *
1052  * Do this only if we are going to recover the filesystem
1053  *
1054  * NOTE: This used to say "if (!readonly)"
1055  * However on Linux, we can & do recover a read-only filesystem.
1056  * We only skip recovery if NORECOVERY is specified on mount,
1057  * in which case we would not be here.
1058  *
1059  * But... if the -device- itself is readonly, just skip this.
1060  * We can't recover this device anyway, so it won't matter.
1061  */
1062  if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1063  error = xlog_clear_stale_blocks(log, tail_lsn);
1064 
1065 done:
1066  xlog_put_bp(bp);
1067 
1068  if (error)
1069  xfs_warn(log->l_mp, "failed to locate log tail");
1070  return error;
1071 }
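
The unmount-record check above uses modular arithmetic so that a record written at the very end of the physical log wraps back to block 0. A sketch of that wrap with made-up numbers:

#include <stdio.h>

int main(void)
{
        int log_bbnum = 1024;   /* physical log size in basic blocks */
        int i = 1022;           /* block of the record header found above */
        int hblks = 1;          /* header blocks */
        int len_bblks = 4;      /* BTOBB(h_len): payload length in blocks */

        int umount_data_blk = (i + hblks) % log_bbnum;              /* 1023 */
        int after_umount_blk = (i + hblks + len_bblks) % log_bbnum; /* 3: wrapped */

        printf("data at %d, next record would start at %d\n",
               umount_data_blk, after_umount_blk);
        return 0;
}
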
1072 
1073 /*
1074  * Is the log zeroed at all?
1075  *
1076  * The last binary search should be changed to perform an X block read
1077  * once X becomes small enough. You can then search linearly through
1078  * the X blocks. This will cut down on the number of reads we need to do.
1079  *
1080  * If the log is partially zeroed, this routine will pass back the blkno
1081  * of the first block with cycle number 0. It won't have a complete LR
1082  * preceding it.
1083  *
1084  * Return:
1085  * 0 => the log is completely written to
1086  * -1 => use *blk_no as the first block of the log
1087  * >0 => error has occurred
1088  */
1089 STATIC int
1090 xlog_find_zeroed(
1091  struct xlog *log,
1092  xfs_daddr_t *blk_no)
1093 {
1094  xfs_buf_t *bp;
1095  xfs_caddr_t offset;
1096  uint first_cycle, last_cycle;
1097  xfs_daddr_t new_blk, last_blk, start_blk;
1098  xfs_daddr_t num_scan_bblks;
1099  int error, log_bbnum = log->l_logBBsize;
1100 
1101  *blk_no = 0;
1102 
1103  /* check totally zeroed log */
1104  bp = xlog_get_bp(log, 1);
1105  if (!bp)
1106  return ENOMEM;
1107  error = xlog_bread(log, 0, 1, bp, &offset);
1108  if (error)
1109  goto bp_err;
1110 
1111  first_cycle = xlog_get_cycle(offset);
1112  if (first_cycle == 0) { /* completely zeroed log */
1113  *blk_no = 0;
1114  xlog_put_bp(bp);
1115  return -1;
1116  }
1117 
1118  /* check partially zeroed log */
1119  error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1120  if (error)
1121  goto bp_err;
1122 
1123  last_cycle = xlog_get_cycle(offset);
1124  if (last_cycle != 0) { /* log completely written to */
1125  xlog_put_bp(bp);
1126  return 0;
1127  } else if (first_cycle != 1) {
1128  /*
1129  * If the cycle of the last block is zero, the cycle of
1130  * the first block must be 1. If it's not, maybe we're
1131  * not looking at a log... Bail out.
1132  */
1133  xfs_warn(log->l_mp,
1134  "Log inconsistent or not a log (last==0, first!=1)");
1135  return XFS_ERROR(EINVAL);
1136  }
1137 
1138  /* we have a partially zeroed log */
1139  last_blk = log_bbnum-1;
1140  if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1141  goto bp_err;
1142 
1143  /*
1144  * Validate the answer. Because there is no way to guarantee that
1145  * the entire log is made up of log records which are the same size,
1146  * we scan over the defined maximum blocks. At this point, the maximum
1147  * is not chosen to mean anything special. XXXmiken
1148  */
1149  num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1150  ASSERT(num_scan_bblks <= INT_MAX);
1151 
1152  if (last_blk < num_scan_bblks)
1153  num_scan_bblks = last_blk;
1154  start_blk = last_blk - num_scan_bblks;
1155 
1156  /*
1157  * We search for any instances of cycle number 0 that occur before
1158  * our current estimate of the head. What we're trying to detect is
1159  * 1 ... | 0 | 1 | 0...
1160  * ^ binary search ends here
1161  */
1162  if ((error = xlog_find_verify_cycle(log, start_blk,
1163  (int)num_scan_bblks, 0, &new_blk)))
1164  goto bp_err;
1165  if (new_blk != -1)
1166  last_blk = new_blk;
1167 
1168  /*
1169  * Potentially backup over partial log record write. We don't need
1170  * to search the end of the log because we know it is zero.
1171  */
1172  if ((error = xlog_find_verify_log_record(log, start_blk,
1173  &last_blk, 0)) == -1) {
1174  error = XFS_ERROR(EIO);
1175  goto bp_err;
1176  } else if (error)
1177  goto bp_err;
1178 
1179  *blk_no = last_blk;
1180 bp_err:
1181  xlog_put_bp(bp);
1182  if (error)
1183  return error;
1184  return -1;
1185 }
1186 
1187 /*
1188  * These are simple subroutines used by xlog_clear_stale_blocks() below
1189  * to initialize a buffer full of empty log record headers and write
1190  * them into the log.
1191  */
1192 STATIC void
1193 xlog_add_record(
1194  struct xlog *log,
1195  xfs_caddr_t buf,
1196  int cycle,
1197  int block,
1198  int tail_cycle,
1199  int tail_block)
1200 {
1201  xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1202 
1203  memset(buf, 0, BBSIZE);
1204  recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1205  recp->h_cycle = cpu_to_be32(cycle);
1206  recp->h_version = cpu_to_be32(
1207  xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1208  recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1209  recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1210  recp->h_fmt = cpu_to_be32(XLOG_FMT);
1211  memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1212 }
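
xlog_add_record() stamps each stub record with an LSN built by xlog_assign_lsn(); CYCLE_LSN() and BLOCK_LSN(), used elsewhere in this file, take it apart again. A standalone sketch of that packing, assuming the usual layout of cycle in the high 32 bits and block number in the low 32 bits:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Pack a cycle/block pair the way xlog_assign_lsn() is assumed to. */
static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
{
        return ((uint64_t)cycle << 32) | block;
}

int main(void)
{
        uint64_t lsn = assign_lsn(7, 4096);

        /* CYCLE_LSN() and BLOCK_LSN() equivalents: shift and truncate. */
        printf("lsn=0x%" PRIx64 " cycle=%u block=%u\n",
               lsn, (uint32_t)(lsn >> 32), (uint32_t)lsn);
        return 0;
}
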
1213 
1214 STATIC int
1215 xlog_write_log_records(
1216  struct xlog *log,
1217  int cycle,
1218  int start_block,
1219  int blocks,
1220  int tail_cycle,
1221  int tail_block)
1222 {
1223  xfs_caddr_t offset;
1224  xfs_buf_t *bp;
1225  int balign, ealign;
1226  int sectbb = log->l_sectBBsize;
1227  int end_block = start_block + blocks;
1228  int bufblks;
1229  int error = 0;
1230  int i, j = 0;
1231 
1232  /*
1233  * Greedily allocate a buffer big enough to handle the full
1234  * range of basic blocks to be written. If that fails, try
1235  * a smaller size. We need to be able to write at least a
1236  * log sector, or we're out of luck.
1237  */
1238  bufblks = 1 << ffs(blocks);
1239  while (bufblks > log->l_logBBsize)
1240  bufblks >>= 1;
1241  while (!(bp = xlog_get_bp(log, bufblks))) {
1242  bufblks >>= 1;
1243  if (bufblks < sectbb)
1244  return ENOMEM;
1245  }
1246 
1247  /* We may need to do a read at the start to fill in part of
1248  * the buffer in the starting sector not covered by the first
1249  * write below.
1250  */
1251  balign = round_down(start_block, sectbb);
1252  if (balign != start_block) {
1253  error = xlog_bread_noalign(log, start_block, 1, bp);
1254  if (error)
1255  goto out_put_bp;
1256 
1257  j = start_block - balign;
1258  }
1259 
1260  for (i = start_block; i < end_block; i += bufblks) {
1261  int bcount, endcount;
1262 
1263  bcount = min(bufblks, end_block - start_block);
1264  endcount = bcount - j;
1265 
1266  /* We may need to do a read at the end to fill in part of
1267  * the buffer in the final sector not covered by the write.
1268  * If this is the same sector as the above read, skip it.
1269  */
1270  ealign = round_down(end_block, sectbb);
1271  if (j == 0 && (start_block + endcount > ealign)) {
1272  offset = bp->b_addr + BBTOB(ealign - start_block);
1273  error = xlog_bread_offset(log, ealign, sectbb,
1274  bp, offset);
1275  if (error)
1276  break;
1277 
1278  }
1279 
1280  offset = xlog_align(log, start_block, endcount, bp);
1281  for (; j < endcount; j++) {
1282  xlog_add_record(log, offset, cycle, i+j,
1283  tail_cycle, tail_block);
1284  offset += BBSIZE;
1285  }
1286  error = xlog_bwrite(log, start_block, endcount, bp);
1287  if (error)
1288  break;
1289  start_block += endcount;
1290  j = 0;
1291  }
1292 
1293  out_put_bp:
1294  xlog_put_bp(bp);
1295  return error;
1296 }
1297 
1298 /*
1299  * This routine is called to blow away any incomplete log writes out
1300  * in front of the log head. We do this so that we won't become confused
1301  * if we come up, write only a little bit more, and then crash again.
1302  * If we leave the partial log records out there, this situation could
1303  * cause us to think those partial writes are valid blocks since they
1304  * have the current cycle number. We get rid of them by overwriting them
1305  * with empty log records with the old cycle number rather than the
1306  * current one.
1307  *
1308  * The tail lsn is passed in rather than taken from
1309  * the log so that we will not write over the unmount record after a
1310  * clean unmount in a 512 block log. Doing so would leave the log without
1311  * any valid log records in it until a new one was written. If we crashed
1312  * during that time we would not be able to recover.
1313  */
1314 STATIC int
1315 xlog_clear_stale_blocks(
1316  struct xlog *log,
1317  xfs_lsn_t tail_lsn)
1318 {
1319  int tail_cycle, head_cycle;
1320  int tail_block, head_block;
1321  int tail_distance, max_distance;
1322  int distance;
1323  int error;
1324 
1325  tail_cycle = CYCLE_LSN(tail_lsn);
1326  tail_block = BLOCK_LSN(tail_lsn);
1327  head_cycle = log->l_curr_cycle;
1328  head_block = log->l_curr_block;
1329 
1330  /*
1331  * Figure out the distance between the new head of the log
1332  * and the tail. We want to write over any blocks beyond the
1333  * head that we may have written just before the crash, but
1334  * we don't want to overwrite the tail of the log.
1335  */
1336  if (head_cycle == tail_cycle) {
1337  /*
1338  * The tail is behind the head in the physical log,
1339  * so the distance from the head to the tail is the
1340  * distance from the head to the end of the log plus
1341  * the distance from the beginning of the log to the
1342  * tail.
1343  */
1344  if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1345  XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1346  XFS_ERRLEVEL_LOW, log->l_mp);
1347  return XFS_ERROR(EFSCORRUPTED);
1348  }
1349  tail_distance = tail_block + (log->l_logBBsize - head_block);
1350  } else {
1351  /*
1352  * The head is behind the tail in the physical log,
1353  * so the distance from the head to the tail is just
1354  * the tail block minus the head block.
1355  */
1356  if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1357  XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1358  XFS_ERRLEVEL_LOW, log->l_mp);
1359  return XFS_ERROR(EFSCORRUPTED);
1360  }
1361  tail_distance = tail_block - head_block;
1362  }
1363 
1364  /*
1365  * If the head is right up against the tail, we can't clear
1366  * anything.
1367  */
1368  if (tail_distance <= 0) {
1369  ASSERT(tail_distance == 0);
1370  return 0;
1371  }
1372 
1373  max_distance = XLOG_TOTAL_REC_SHIFT(log);
1374  /*
1375  * Take the smaller of the maximum amount of outstanding I/O
1376  * we could have and the distance to the tail to clear out.
1377  * We take the smaller so that we don't overwrite the tail and
1378  * we don't waste all day writing from the head to the tail
1379  * for no reason.
1380  */
1381  max_distance = MIN(max_distance, tail_distance);
1382 
1383  if ((head_block + max_distance) <= log->l_logBBsize) {
1384  /*
1385  * We can stomp all the blocks we need to without
1386  * wrapping around the end of the log. Just do it
1387  * in a single write. Use the cycle number of the
1388  * current cycle minus one so that the log will look like:
1389  * n ... | n - 1 ...
1390  */
1391  error = xlog_write_log_records(log, (head_cycle - 1),
1392  head_block, max_distance, tail_cycle,
1393  tail_block);
1394  if (error)
1395  return error;
1396  } else {
1397  /*
1398  * We need to wrap around the end of the physical log in
1399  * order to clear all the blocks. Do it in two separate
1400  * I/Os. The first write should be from the head to the
1401  * end of the physical log, and it should use the current
1402  * cycle number minus one just like above.
1403  */
1404  distance = log->l_logBBsize - head_block;
1405  error = xlog_write_log_records(log, (head_cycle - 1),
1406  head_block, distance, tail_cycle,
1407  tail_block);
1408 
1409  if (error)
1410  return error;
1411 
1412  /*
1413  * Now write the blocks at the start of the physical log.
1414  * This writes the remainder of the blocks we want to clear.
1415  * It uses the current cycle number since we're now on the
1416  * same cycle as the head so that we get:
1417  * n ... n ... | n - 1 ...
1418  * ^^^^^ blocks we're writing
1419  */
1420  distance = max_distance - (log->l_logBBsize - head_block);
1421  error = xlog_write_log_records(log, head_cycle, 0, distance,
1422  tail_cycle, tail_block);
1423  if (error)
1424  return error;
1425  }
1426 
1427  return 0;
1428 }
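
The head-to-tail distance above depends on whether the head has already wrapped onto the next cycle. A compact sketch of the two cases:

#include <stdio.h>

/* Distance from head to tail in a circular log of log_bbsize blocks,
 * mirroring the two cases in xlog_clear_stale_blocks(). */
static int tail_distance(int head_cycle, int head_block,
                         int tail_cycle, int tail_block, int log_bbsize)
{
        if (head_cycle == tail_cycle)
                /* Tail is behind the head on the same cycle: wrap around. */
                return tail_block + (log_bbsize - head_block);
        /* Head already wrapped onto the next cycle. */
        return tail_block - head_block;
}

int main(void)
{
        printf("%d\n", tail_distance(5, 900, 5, 100, 1024));   /* 100 + 124 = 224 */
        printf("%d\n", tail_distance(6, 100, 5, 900, 1024));   /* 900 - 100 = 800 */
        return 0;
}
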
1429 
1430 /******************************************************************************
1431  *
1432  * Log recover routines
1433  *
1434  ******************************************************************************
1435  */
1436 
1437 STATIC xlog_recover_t *
1438 xlog_recover_find_tid(
1439  struct hlist_head *head,
1440  xlog_tid_t tid)
1441 {
1442  xlog_recover_t *trans;
1443  struct hlist_node *n;
1444 
1445  hlist_for_each_entry(trans, n, head, r_list) {
1446  if (trans->r_log_tid == tid)
1447  return trans;
1448  }
1449  return NULL;
1450 }
1451 
1452 STATIC void
1453 xlog_recover_new_tid(
1454  struct hlist_head *head,
1455  xlog_tid_t tid,
1456  xfs_lsn_t lsn)
1457 {
1458  xlog_recover_t *trans;
1459 
1460  trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1461  trans->r_log_tid = tid;
1462  trans->r_lsn = lsn;
1463  INIT_LIST_HEAD(&trans->r_itemq);
1464 
1465  INIT_HLIST_NODE(&trans->r_list);
1466  hlist_add_head(&trans->r_list, head);
1467 }
1468 
1469 STATIC void
1470 xlog_recover_add_item(
1471  struct list_head *head)
1472 {
1473  xlog_recover_item_t *item;
1474 
1475  item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1476  INIT_LIST_HEAD(&item->ri_list);
1477  list_add_tail(&item->ri_list, head);
1478 }
1479 
1480 STATIC int
1481 xlog_recover_add_to_cont_trans(
1482  struct xlog *log,
1483  struct xlog_recover *trans,
1484  xfs_caddr_t dp,
1485  int len)
1486 {
1487  xlog_recover_item_t *item;
1488  xfs_caddr_t ptr, old_ptr;
1489  int old_len;
1490 
1491  if (list_empty(&trans->r_itemq)) {
1492  /* finish copying rest of trans header */
1493  xlog_recover_add_item(&trans->r_itemq);
1494  ptr = (xfs_caddr_t) &trans->r_theader +
1495  sizeof(xfs_trans_header_t) - len;
1496  memcpy(ptr, dp, len); /* d, s, l */
1497  return 0;
1498  }
1499  /* take the tail entry */
1500  item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1501 
1502  old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1503  old_len = item->ri_buf[item->ri_cnt-1].i_len;
1504 
1505  ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1506  memcpy(&ptr[old_len], dp, len); /* d, s, l */
1507  item->ri_buf[item->ri_cnt-1].i_len += len;
1508  item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1509  trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1510  return 0;
1511 }
1512 
1513 /*
1514  * The next region to add is the start of a new region. It could be
1515  * a whole region or it could be the first part of a new region. Because
1516  * of this, the assumption here is that the type and size fields of all
1517  * format structures fit into the first 32 bits of the structure.
1518  *
1519  * This works because all regions must be 32 bit aligned. Therefore, we
1520  * either have both fields or we have neither field. In the case we have
1521  * neither field, the data part of the region is zero length. We only have
1522  * a log_op_header and can throw away the header since a new one will appear
1523  * later. If we have at least 4 bytes, then we can determine how many regions
1524  * will appear in the current log item.
1525  */
1526 STATIC int
1527 xlog_recover_add_to_trans(
1528  struct xlog *log,
1529  struct xlog_recover *trans,
1530  xfs_caddr_t dp,
1531  int len)
1532 {
1533  xfs_inode_log_format_t *in_f; /* any will do */
1534  xlog_recover_item_t *item;
1535  xfs_caddr_t ptr;
1536 
1537  if (!len)
1538  return 0;
1539  if (list_empty(&trans->r_itemq)) {
1540  /* we need to catch log corruptions here */
1541  if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1542  xfs_warn(log->l_mp, "%s: bad header magic number",
1543  __func__);
1544  ASSERT(0);
1545  return XFS_ERROR(EIO);
1546  }
1547  if (len == sizeof(xfs_trans_header_t))
1548  xlog_recover_add_item(&trans->r_itemq);
1549  memcpy(&trans->r_theader, dp, len); /* d, s, l */
1550  return 0;
1551  }
1552 
1553  ptr = kmem_alloc(len, KM_SLEEP);
1554  memcpy(ptr, dp, len);
1555  in_f = (xfs_inode_log_format_t *)ptr;
1556 
1557  /* take the tail entry */
1558  item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1559  if (item->ri_total != 0 &&
1560  item->ri_total == item->ri_cnt) {
1561  /* tail item is in use, get a new one */
1562  xlog_recover_add_item(&trans->r_itemq);
1563  item = list_entry(trans->r_itemq.prev,
1564  xlog_recover_item_t, ri_list);
1565  }
1566 
1567  if (item->ri_total == 0) { /* first region to be added */
1568  if (in_f->ilf_size == 0 ||
1569  in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1570  xfs_warn(log->l_mp,
1571  "bad number of regions (%d) in inode log format",
1572  in_f->ilf_size);
1573  ASSERT(0);
1574  return XFS_ERROR(EIO);
1575  }
1576 
1577  item->ri_total = in_f->ilf_size;
1578  item->ri_buf =
1579  kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1580  KM_SLEEP);
1581  }
1582  ASSERT(item->ri_total > item->ri_cnt);
1583  /* Description region is ri_buf[0] */
1584  item->ri_buf[item->ri_cnt].i_addr = ptr;
1585  item->ri_buf[item->ri_cnt].i_len = len;
1586  item->ri_cnt++;
1587  trace_xfs_log_recover_item_add(log, trans, item, 0);
1588  return 0;
1589 }
1590 
1591 /*
1592  * Sort the log items in the transaction. Cancelled buffers need
1593  * to be put first so they are processed before any items that might
1594  * modify the buffers. If they are cancelled, then the modifications
1595  * don't need to be replayed.
1596  */
1597 STATIC int
1598 xlog_recover_reorder_trans(
1599  struct xlog *log,
1600  struct xlog_recover *trans,
1601  int pass)
1602 {
1603  xlog_recover_item_t *item, *n;
1604  LIST_HEAD(sort_list);
1605 
1606  list_splice_init(&trans->r_itemq, &sort_list);
1607  list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1608  xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1609 
1610  switch (ITEM_TYPE(item)) {
1611  case XFS_LI_BUF:
1612  if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1613  trace_xfs_log_recover_item_reorder_head(log,
1614  trans, item, pass);
1615  list_move(&item->ri_list, &trans->r_itemq);
1616  break;
1617  }
1618  case XFS_LI_INODE:
1619  case XFS_LI_DQUOT:
1620  case XFS_LI_QUOTAOFF:
1621  case XFS_LI_EFD:
1622  case XFS_LI_EFI:
1623  trace_xfs_log_recover_item_reorder_tail(log,
1624  trans, item, pass);
1625  list_move_tail(&item->ri_list, &trans->r_itemq);
1626  break;
1627  default:
1628  xfs_warn(log->l_mp,
1629  "%s: unrecognized type of log operation",
1630  __func__);
1631  ASSERT(0);
1632  return XFS_ERROR(EIO);
1633  }
1634  }
1635  ASSERT(list_empty(&sort_list));
1636  return 0;
1637 }
1638 
1639 /*
1640  * Build up the table of buf cancel records so that we don't replay
1641  * cancelled data in the second pass. For buffer records that are
1642  * not cancel records, there is nothing to do here so we just return.
1643  *
1644  * If we get a cancel record which is already in the table, this indicates
1645  * that the buffer was cancelled multiple times. In order to ensure
1646  * that during pass 2 we keep the record in the table until we reach its
1647  * last occurrence in the log, we keep a reference count in the cancel
1648  * record in the table to tell us how many times we expect to see this
1649  * record during the second pass.
1650  */
1651 STATIC int
1652 xlog_recover_buffer_pass1(
1653  struct xlog *log,
1654  struct xlog_recover_item *item)
1655 {
1656  xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1657  struct list_head *bucket;
1658  struct xfs_buf_cancel *bcp;
1659 
1660  /*
1661  * If this isn't a cancel buffer item, then just return.
1662  */
1663  if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1664  trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1665  return 0;
1666  }
1667 
1668  /*
1669  * Insert an xfs_buf_cancel record into the hash table of them.
1670  * If there is already an identical record, bump its reference count.
1671  */
1672  bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1673  list_for_each_entry(bcp, bucket, bc_list) {
1674  if (bcp->bc_blkno == buf_f->blf_blkno &&
1675  bcp->bc_len == buf_f->blf_len) {
1676  bcp->bc_refcount++;
1677  trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1678  return 0;
1679  }
1680  }
1681 
1682  bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1683  bcp->bc_blkno = buf_f->blf_blkno;
1684  bcp->bc_len = buf_f->blf_len;
1685  bcp->bc_refcount = 1;
1686  list_add_tail(&bcp->bc_list, bucket);
1687 
1688  trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1689  return 0;
1690 }
1691 
1692 /*
1693  * Check to see whether the buffer being recovered has a corresponding
1694  * entry in the buffer cancel record table. If it does then return 1
1695  * so that it will be cancelled, otherwise return 0. If the buffer is
1696  * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1697  * the refcount on the entry in the table and remove it from the table
1698  * if this is the last reference.
1699  *
1700  * We remove the cancel record from the table when we encounter its
1701  * last occurrence in the log so that if the same buffer is re-used
1702  * again after its last cancellation we actually replay the changes
1703  * made at that point.
1704  */
1705 STATIC int
1706 xlog_check_buffer_cancelled(
1707  struct xlog *log,
1708  xfs_daddr_t blkno,
1709  uint len,
1710  ushort flags)
1711 {
1712  struct list_head *bucket;
1713  struct xfs_buf_cancel *bcp;
1714 
1715  if (log->l_buf_cancel_table == NULL) {
1716  /*
1717  * There is nothing in the table built in pass one,
1718  * so this buffer must not be cancelled.
1719  */
1720  ASSERT(!(flags & XFS_BLF_CANCEL));
1721  return 0;
1722  }
1723 
1724  /*
1725  * Search for an entry in the cancel table that matches our buffer.
1726  */
1727  bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1728  list_for_each_entry(bcp, bucket, bc_list) {
1729  if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1730  goto found;
1731  }
1732 
1733  /*
1734  * We didn't find a corresponding entry in the table, so return 0 so
1735  * that the buffer is NOT cancelled.
1736  */
1737  ASSERT(!(flags & XFS_BLF_CANCEL));
1738  return 0;
1739 
1740 found:
1741  /*
1742  * We've got a match, so return 1 so that the recovery of this buffer
1743  * is cancelled. If this buffer is actually a buffer cancel log
1744  * item, then decrement the refcount on the one in the table and
1745  * remove it if this is the last reference.
1746  */
1747  if (flags & XFS_BLF_CANCEL) {
1748  if (--bcp->bc_refcount == 0) {
1749  list_del(&bcp->bc_list);
1750  kmem_free(bcp);
1751  }
1752  }
1753  return 1;
1754 }
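
Together, xlog_recover_buffer_pass1() and xlog_check_buffer_cancelled() implement a reference-counted cancellation table: a buffer cancelled N times in pass 1 keeps suppressing replays in pass 2 until its Nth cancel item is seen. A simplified userspace sketch (flat array instead of the kernel's hashed list_heads; names are illustrative):

#include <stdio.h>

struct cancel_entry {
        long long       blkno;
        unsigned        len;
        int             refcount;
};

static struct cancel_entry table[16];
static int nentries;

/* Pass 1: record a cancelled buffer, bumping the refcount on duplicates. */
static void cancel_add(long long blkno, unsigned len)
{
        int i;

        for (i = 0; i < nentries; i++) {
                if (table[i].blkno == blkno && table[i].len == len) {
                        table[i].refcount++;
                        return;
                }
        }
        table[nentries].blkno = blkno;
        table[nentries].len = len;
        table[nentries].refcount = 1;
        nentries++;
}

/* Pass 2: return 1 if this buffer is cancelled; drop the entry when the
 * last reference from a cancel item itself is seen. */
static int buffer_cancelled(long long blkno, unsigned len, int is_cancel_item)
{
        int i;

        for (i = 0; i < nentries; i++) {
                if (table[i].blkno != blkno || table[i].len != len)
                        continue;
                if (is_cancel_item && --table[i].refcount == 0)
                        table[i] = table[--nentries];   /* remove entry */
                return 1;
        }
        return 0;
}

int main(void)
{
        cancel_add(128, 8);
        cancel_add(128, 8);                             /* cancelled twice */
        printf("%d\n", buffer_cancelled(128, 8, 0));    /* 1: replay skipped */
        printf("%d\n", buffer_cancelled(128, 8, 1));    /* 1: first cancel item */
        printf("%d\n", buffer_cancelled(128, 8, 1));    /* 1: last cancel item, entry freed */
        printf("%d\n", buffer_cancelled(128, 8, 0));    /* 0: later re-use is replayed */
        return 0;
}
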
1755 
1756 /*
1757  * Perform recovery for a buffer full of inodes. In these buffers, the only
1758  * data which should be recovered is that which corresponds to the
1759  * di_next_unlinked pointers in the on disk inode structures. The rest of the
1760  * data for the inodes is always logged through the inodes themselves rather
1761  * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1762  *
1763  * The only time when buffers full of inodes are fully recovered is when the
1764  * buffer is full of newly allocated inodes. In this case the buffer will
1765  * not be marked as an inode buffer and so will be sent to
1766  * xlog_recover_do_reg_buffer() below during recovery.
1767  */
1768 STATIC int
1769 xlog_recover_do_inode_buf(
1770  struct xfs_mount *mp,
1771  xlog_recover_item_t *item,
1772  struct xfs_buf *bp,
1773  xfs_buf_log_format_t *buf_f)
1774 {
1775  int i;
1776  int item_index = 0;
1777  int bit = 0;
1778  int nbits = 0;
1779  int reg_buf_offset = 0;
1780  int reg_buf_bytes = 0;
1781  int next_unlinked_offset;
1782  int inodes_per_buf;
1783  xfs_agino_t *logged_nextp;
1784  xfs_agino_t *buffer_nextp;
1785 
1786  trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1787 
1788  inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1789  for (i = 0; i < inodes_per_buf; i++) {
1790  next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1791  offsetof(xfs_dinode_t, di_next_unlinked);
1792 
1793  while (next_unlinked_offset >=
1794  (reg_buf_offset + reg_buf_bytes)) {
1795  /*
1796  * The next di_next_unlinked field is beyond
1797  * the current logged region. Find the next
1798  * logged region that contains or is beyond
1799  * the current di_next_unlinked field.
1800  */
1801  bit += nbits;
1802  bit = xfs_next_bit(buf_f->blf_data_map,
1803  buf_f->blf_map_size, bit);
1804 
1805  /*
1806  * If there are no more logged regions in the
1807  * buffer, then we're done.
1808  */
1809  if (bit == -1)
1810  return 0;
1811 
1812  nbits = xfs_contig_bits(buf_f->blf_data_map,
1813  buf_f->blf_map_size, bit);
1814  ASSERT(nbits > 0);
1815  reg_buf_offset = bit << XFS_BLF_SHIFT;
1816  reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1817  item_index++;
1818  }
1819 
1820  /*
1821  * If the current logged region starts after the current
1822  * di_next_unlinked field, then move on to the next
1823  * di_next_unlinked field.
1824  */
1825  if (next_unlinked_offset < reg_buf_offset)
1826  continue;
1827 
1828  ASSERT(item->ri_buf[item_index].i_addr != NULL);
1829  ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1830  ASSERT((reg_buf_offset + reg_buf_bytes) <=
1831  BBTOB(bp->b_io_length));
1832 
1833  /*
1834  * The current logged region contains a copy of the
1835  * current di_next_unlinked field. Extract its value
1836  * and copy it to the buffer copy.
1837  */
1838  logged_nextp = item->ri_buf[item_index].i_addr +
1839  next_unlinked_offset - reg_buf_offset;
1840  if (unlikely(*logged_nextp == 0)) {
1841  xfs_alert(mp,
1842  "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1843  "Trying to replay bad (0) inode di_next_unlinked field.",
1844  item, bp);
1845  XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1846  XFS_ERRLEVEL_LOW, mp);
1847  return XFS_ERROR(EFSCORRUPTED);
1848  }
1849 
1850  buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1851  next_unlinked_offset);
1852  *buffer_nextp = *logged_nextp;
1853  }
1854 
1855  return 0;
1856 }
1857 
1858 /*
1859  * Perform a 'normal' buffer recovery. Each logged region of the
1860  * buffer should be copied over the corresponding region in the
1861  * given buffer. The bitmap in the buf log format structure indicates
1862  * where to place the logged data.
1863  */
1864 STATIC void
1865 xlog_recover_do_reg_buffer(
1866  struct xfs_mount *mp,
1867  struct xlog_recover_item *item,
1868  struct xfs_buf *bp,
1869  xfs_buf_log_format_t *buf_f)
1870 {
1871  int i;
1872  int bit;
1873  int nbits;
1874  int error;
1875 
1876  trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1877 
1878  bit = 0;
1879  i = 1; /* 0 is the buf format structure */
1880  while (1) {
1881  bit = xfs_next_bit(buf_f->blf_data_map,
1882  buf_f->blf_map_size, bit);
1883  if (bit == -1)
1884  break;
1885  nbits = xfs_contig_bits(buf_f->blf_data_map,
1886  buf_f->blf_map_size, bit);
1887  ASSERT(nbits > 0);
1888  ASSERT(item->ri_buf[i].i_addr != NULL);
1889  ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1890  ASSERT(BBTOB(bp->b_io_length) >=
1891  ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
1892 
1893  /*
1894  * Do a sanity check if this is a dquot buffer. Just checking
1895  * the first dquot in the buffer should do. XXX: This is
1896  * probably a good thing to do for other buf types also.
1897  */
1898  error = 0;
1899  if (buf_f->blf_flags &
1900  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1901  if (item->ri_buf[i].i_addr == NULL) {
1902  xfs_alert(mp,
1903  "XFS: NULL dquot in %s.", __func__);
1904  goto next;
1905  }
1906  if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1907  xfs_alert(mp,
1908  "XFS: dquot too small (%d) in %s.",
1909  item->ri_buf[i].i_len, __func__);
1910  goto next;
1911  }
1912  error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1913  -1, 0, XFS_QMOPT_DOWARN,
1914  "dquot_buf_recover");
1915  if (error)
1916  goto next;
1917  }
1918 
1919  memcpy(xfs_buf_offset(bp,
1920  (uint)bit << XFS_BLF_SHIFT), /* dest */
1921  item->ri_buf[i].i_addr, /* source */
1922  nbits<<XFS_BLF_SHIFT); /* length */
1923  next:
1924  i++;
1925  bit += nbits;
1926  }
1927 
1928  /* Shouldn't be any more regions */
1929  ASSERT(i == item->ri_total);
1930 }
1931 
1932 /*
1933  * Do some primitive error checking on ondisk dquot data structures.
1934  */
1935 int
1936 xfs_qm_dqcheck(
1937  struct xfs_mount *mp,
1938  xfs_disk_dquot_t *ddq,
1939  xfs_dqid_t id,
1940  uint type, /* used only when IO_dorepair is true */
1941  uint flags,
1942  char *str)
1943 {
1944  xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1945  int errs = 0;
1946 
1947  /*
1948  * We can encounter an uninitialized dquot buffer for 2 reasons:
1949  * 1. If we crash while deleting the quotainode(s), and those blks got
1950  * used for user data. This is because we take the path of regular
1951  * file deletion; however, the size field of quotainodes is never
1952  * updated, so all the tricks that we play in itruncate_finish
1953  * don't quite matter.
1954  *
1955  * 2. We don't replay the quota buffers when there's a quotaoff logitem.
1956  * But the allocation will be replayed so we'll end up with an
1957  * uninitialized quota block.
1958  *
1959  * This is all fine; things are still consistent, and we haven't lost
1960  * any quota information. Just don't complain about bad dquot blks.
1961  */
1962  if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
1963  if (flags & XFS_QMOPT_DOWARN)
1964  xfs_alert(mp,
1965  "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1966  str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1967  errs++;
1968  }
1969  if (ddq->d_version != XFS_DQUOT_VERSION) {
1970  if (flags & XFS_QMOPT_DOWARN)
1971  xfs_alert(mp,
1972  "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1973  str, id, ddq->d_version, XFS_DQUOT_VERSION);
1974  errs++;
1975  }
1976 
1977  if (ddq->d_flags != XFS_DQ_USER &&
1978  ddq->d_flags != XFS_DQ_PROJ &&
1979  ddq->d_flags != XFS_DQ_GROUP) {
1980  if (flags & XFS_QMOPT_DOWARN)
1981  xfs_alert(mp,
1982  "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1983  str, id, ddq->d_flags);
1984  errs++;
1985  }
1986 
1987  if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1988  if (flags & XFS_QMOPT_DOWARN)
1989  xfs_alert(mp,
1990  "%s : ondisk-dquot 0x%p, ID mismatch: "
1991  "0x%x expected, found id 0x%x",
1992  str, ddq, id, be32_to_cpu(ddq->d_id));
1993  errs++;
1994  }
1995 
1996  if (!errs && ddq->d_id) {
1997  if (ddq->d_blk_softlimit &&
1998  be64_to_cpu(ddq->d_bcount) >
1999  be64_to_cpu(ddq->d_blk_softlimit)) {
2000  if (!ddq->d_btimer) {
2001  if (flags & XFS_QMOPT_DOWARN)
2002  xfs_alert(mp,
2003  "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2004  str, (int)be32_to_cpu(ddq->d_id), ddq);
2005  errs++;
2006  }
2007  }
2008  if (ddq->d_ino_softlimit &&
2009  be64_to_cpu(ddq->d_icount) >
2010  be64_to_cpu(ddq->d_ino_softlimit)) {
2011  if (!ddq->d_itimer) {
2012  if (flags & XFS_QMOPT_DOWARN)
2013  xfs_alert(mp,
2014  "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2015  str, (int)be32_to_cpu(ddq->d_id), ddq);
2016  errs++;
2017  }
2018  }
2019  if (ddq->d_rtb_softlimit &&
2020  be64_to_cpu(ddq->d_rtbcount) >
2021  be64_to_cpu(ddq->d_rtb_softlimit)) {
2022  if (!ddq->d_rtbtimer) {
2023  if (flags & XFS_QMOPT_DOWARN)
2024  xfs_alert(mp,
2025  "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2026  str, (int)be32_to_cpu(ddq->d_id), ddq);
2027  errs++;
2028  }
2029  }
2030  }
2031 
2032  if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2033  return errs;
2034 
2035  if (flags & XFS_QMOPT_DOWARN)
2036  xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2037 
2038  /*
2039  * Typically, a repair is only requested by quotacheck.
2040  */
2041  ASSERT(id != -1);
2042  ASSERT(flags & XFS_QMOPT_DQREPAIR);
2043  memset(d, 0, sizeof(xfs_dqblk_t));
2044 
2045  d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2046  d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2047  d->dd_diskdq.d_flags = type;
2048  d->dd_diskdq.d_id = cpu_to_be32(id);
2049 
2050  return errs;
2051 }
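
As a rough illustration of the timer rule enforced above -- usage over a non-zero soft limit requires the matching timer to have been started -- here is a small userspace sketch with invented field names; it is not the kernel dquot structure or API.

#include <stdint.h>
#include <stdio.h>

struct dquot_usage {
	uint64_t	bcount, blk_softlimit, btimer;
	uint64_t	icount, ino_softlimit, itimer;
};

static int dquot_timers_ok(const struct dquot_usage *q)
{
	if (q->blk_softlimit && q->bcount > q->blk_softlimit && !q->btimer)
		return 0;	/* over block soft limit but timer not started */
	if (q->ino_softlimit && q->icount > q->ino_softlimit && !q->itimer)
		return 0;	/* over inode soft limit but timer not started */
	return 1;
}

int main(void)
{
	struct dquot_usage ok  = { 10, 100, 0, 5, 50, 0 };
	struct dquot_usage bad = { 200, 100, 0, 5, 50, 0 };

	printf("%d %d\n", dquot_timers_ok(&ok), dquot_timers_ok(&bad)); /* 1 0 */
	return 0;
}
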
2052 
2053 /*
2054  * Perform a dquot buffer recovery.
2055  * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2056  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2057  * Else, treat it as a regular buffer and do recovery.
2058  */
2059 STATIC void
2060 xlog_recover_do_dquot_buffer(
2061  struct xfs_mount *mp,
2062  struct xlog *log,
2063  struct xlog_recover_item *item,
2064  struct xfs_buf *bp,
2065  struct xfs_buf_log_format *buf_f)
2066 {
2067  uint type;
2068 
2069  trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2070 
2071  /*
2072  * Filesystems are required to send in quota flags at mount time.
2073  */
2074  if (mp->m_qflags == 0) {
2075  return;
2076  }
2077 
2078  type = 0;
2079  if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2080  type |= XFS_DQ_USER;
2081  if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2082  type |= XFS_DQ_PROJ;
2083  if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2084  type |= XFS_DQ_GROUP;
2085  /*
2086  * This type of quotas was turned off, so ignore this buffer
2087  */
2088  if (log->l_quotaoffs_flag & type)
2089  return;
2090 
2091  xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2092 }
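
A minimal sketch of the filtering idea above (names and flag values are invented, not the XFS definitions): a buffer carrying any quota type that was turned off by an earlier QUOTAOFF record is skipped rather than replayed.

#include <stdio.h>

#define Q_USER	0x1
#define Q_PROJ	0x2
#define Q_GROUP	0x4

/* replay only when none of the buffer's quota types were turned off */
static int should_replay(unsigned int buf_types, unsigned int quotaoff_mask)
{
	return (buf_types & quotaoff_mask) == 0;
}

int main(void)
{
	unsigned int quotaoff_mask = Q_GROUP;	/* group quota was turned off */

	printf("%d\n", should_replay(Q_USER, quotaoff_mask));		/* 1: replay */
	printf("%d\n", should_replay(Q_GROUP, quotaoff_mask));		/* 0: skip */
	printf("%d\n", should_replay(Q_USER | Q_GROUP, quotaoff_mask));	/* 0: skip */
	return 0;
}
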
2093 
2094 /*
2095  * This routine replays a modification made to a buffer at runtime.
2096  * There are actually two types of buffer, regular and inode, which
2097  * are handled differently. Inode buffers are handled differently
2098  * in that we only recover a specific set of data from them, namely
2099  * the inode di_next_unlinked fields. This is because all other inode
2100  * data is actually logged via inode records and any data we replay
2101  * here which overlaps that may be stale.
2102  *
2103  * When meta-data buffers are freed at run time we log a buffer item
2104  * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2105  * of the buffer in the log should not be replayed at recovery time.
2106  * This is so that if the blocks covered by the buffer are reused for
2107  * file data before we crash we don't end up replaying old, freed
2108  * meta-data into a user's file.
2109  *
2110  * To handle the cancellation of buffer log items, we make two passes
2111  * over the log during recovery. During the first we build a table of
2112  * those buffers which have been cancelled, and during the second we
2113  * only replay those buffers which do not have corresponding cancel
2114  * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2115  * for more details on the implementation of the table of cancel records.
2116  */
2117 STATIC int
2118 xlog_recover_buffer_pass2(
2119  struct xlog *log,
2120  struct list_head *buffer_list,
2121  struct xlog_recover_item *item)
2122 {
2123  xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2124  xfs_mount_t *mp = log->l_mp;
2125  xfs_buf_t *bp;
2126  int error;
2127  uint buf_flags;
2128 
2129  /*
2130  * In this pass we only want to recover all the buffers which have
2131  * not been cancelled and are not cancellation buffers themselves.
2132  */
2133  if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2134  buf_f->blf_len, buf_f->blf_flags)) {
2135  trace_xfs_log_recover_buf_cancel(log, buf_f);
2136  return 0;
2137  }
2138 
2139  trace_xfs_log_recover_buf_recover(log, buf_f);
2140 
2141  buf_flags = 0;
2142  if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2143  buf_flags |= XBF_UNMAPPED;
2144 
2145  bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146  buf_flags);
2147  if (!bp)
2148  return XFS_ERROR(ENOMEM);
2149  error = bp->b_error;
2150  if (error) {
2151  xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2152  xfs_buf_relse(bp);
2153  return error;
2154  }
2155 
2156  if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2157  error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2158  } else if (buf_f->blf_flags &
2159  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2160  xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2161  } else {
2162  xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2163  }
2164  if (error)
2165  return XFS_ERROR(error);
2166 
2167  /*
2168  * Perform delayed write on the buffer. Asynchronous writes will be
2169  * slower when taking into account all the buffers to be flushed.
2170  *
2171  * Also make sure that only inode buffers with good sizes stay in
2172  * the buffer cache. The kernel moves inodes in buffers of 1 block
2173  * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2174  * buffers in the log can be a different size if the log was generated
2175  * by an older kernel using unclustered inode buffers or a newer kernel
2176  * running with a different inode cluster size. Regardless, if the
2177  * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2178  * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2179  * the buffer out of the buffer cache so that the buffer won't
2180  * overlap with future reads of those inodes.
2181  */
2182  if (XFS_DINODE_MAGIC ==
2183  be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2184  (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2185  (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2186  xfs_buf_stale(bp);
2187  error = xfs_bwrite(bp);
2188  } else {
2189  ASSERT(bp->b_target->bt_mount == mp);
2190  bp->b_iodone = xlog_recover_iodone;
2191  xfs_buf_delwri_queue(bp, buffer_list);
2192  }
2193 
2194  xfs_buf_relse(bp);
2195  return error;
2196 }
2197 
2198 STATIC int
2199 xlog_recover_inode_pass2(
2200  struct xlog *log,
2201  struct list_head *buffer_list,
2202  struct xlog_recover_item *item)
2203 {
2204  xfs_inode_log_format_t *in_f;
2205  xfs_mount_t *mp = log->l_mp;
2206  xfs_buf_t *bp;
2207  xfs_dinode_t *dip;
2208  int len;
2209  xfs_caddr_t src;
2210  xfs_caddr_t dest;
2211  int error;
2212  int attr_index;
2213  uint fields;
2214  xfs_icdinode_t *dicp;
2215  int need_free = 0;
2216 
2217  if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2218  in_f = item->ri_buf[0].i_addr;
2219  } else {
2220  in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2221  need_free = 1;
2222  error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2223  if (error)
2224  goto error;
2225  }
2226 
2227  /*
2228  * Inode buffers can be freed, look out for it,
2229  * and do not replay the inode.
2230  */
2231  if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2232  in_f->ilf_len, 0)) {
2233  error = 0;
2234  trace_xfs_log_recover_inode_cancel(log, in_f);
2235  goto error;
2236  }
2237  trace_xfs_log_recover_inode_recover(log, in_f);
2238 
2239  bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
2240  if (!bp) {
2241  error = ENOMEM;
2242  goto error;
2243  }
2244  error = bp->b_error;
2245  if (error) {
2246  xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2247  xfs_buf_relse(bp);
2248  goto error;
2249  }
2250  ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2251  dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2252 
2253  /*
2254  * Make sure the place we're flushing out to really looks
2255  * like an inode!
2256  */
2257  if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2258  xfs_buf_relse(bp);
2259  xfs_alert(mp,
2260  "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2261  __func__, dip, bp, in_f->ilf_ino);
2262  XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2263  XFS_ERRLEVEL_LOW, mp);
2264  error = EFSCORRUPTED;
2265  goto error;
2266  }
2267  dicp = item->ri_buf[1].i_addr;
2268  if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2269  xfs_buf_relse(bp);
2270  xfs_alert(mp,
2271  "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2272  __func__, item, in_f->ilf_ino);
2273  XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2274  XFS_ERRLEVEL_LOW, mp);
2275  error = EFSCORRUPTED;
2276  goto error;
2277  }
2278 
2279  /* Skip replay when the on disk inode is newer than the log one */
2280  if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2281  /*
2282  * Deal with the wrap case: DI_MAX_FLUSH wraps back around to
2283  * smaller numbers
2284  */
2285  if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2286  dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2287  /* do nothing */
2288  } else {
2289  xfs_buf_relse(bp);
2290  trace_xfs_log_recover_inode_skip(log, in_f);
2291  error = 0;
2292  goto error;
2293  }
2294  }
2295  /* Take the opportunity to reset the flush iteration count */
2296  dicp->di_flushiter = 0;
2297 
2298  if (unlikely(S_ISREG(dicp->di_mode))) {
2299  if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2300  (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2301  XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2302  XFS_ERRLEVEL_LOW, mp, dicp);
2303  xfs_buf_relse(bp);
2304  xfs_alert(mp,
2305  "%s: Bad regular inode log record, rec ptr 0x%p, "
2306  "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2307  __func__, item, dip, bp, in_f->ilf_ino);
2308  error = EFSCORRUPTED;
2309  goto error;
2310  }
2311  } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2312  if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2313  (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2314  (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2315  XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2316  XFS_ERRLEVEL_LOW, mp, dicp);
2317  xfs_buf_relse(bp);
2318  xfs_alert(mp,
2319  "%s: Bad dir inode log record, rec ptr 0x%p, "
2320  "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2321  __func__, item, dip, bp, in_f->ilf_ino);
2322  error = EFSCORRUPTED;
2323  goto error;
2324  }
2325  }
2326  if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2327  XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2328  XFS_ERRLEVEL_LOW, mp, dicp);
2329  xfs_buf_relse(bp);
2330  xfs_alert(mp,
2331  "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2332  "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2333  __func__, item, dip, bp, in_f->ilf_ino,
2334  dicp->di_nextents + dicp->di_anextents,
2335  dicp->di_nblocks);
2336  error = EFSCORRUPTED;
2337  goto error;
2338  }
2339  if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2340  XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2341  XFS_ERRLEVEL_LOW, mp, dicp);
2342  xfs_buf_relse(bp);
2343  xfs_alert(mp,
2344  "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2345  "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2346  item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2347  error = EFSCORRUPTED;
2348  goto error;
2349  }
2350  if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2351  XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2352  XFS_ERRLEVEL_LOW, mp, dicp);
2353  xfs_buf_relse(bp);
2354  xfs_alert(mp,
2355  "%s: Bad inode log record length %d, rec ptr 0x%p",
2356  __func__, item->ri_buf[1].i_len, item);
2357  error = EFSCORRUPTED;
2358  goto error;
2359  }
2360 
2361  /* The core is in in-core format */
2362  xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2363 
2364  /* the rest is in on-disk format */
2365  if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2366  memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2367  item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2368  item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2369  }
2370 
2371  fields = in_f->ilf_fields;
2372  switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2373  case XFS_ILOG_DEV:
2374  xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2375  break;
2376  case XFS_ILOG_UUID:
2377  memcpy(XFS_DFORK_DPTR(dip),
2378  &in_f->ilf_u.ilfu_uuid,
2379  sizeof(uuid_t));
2380  break;
2381  }
2382 
2383  if (in_f->ilf_size == 2)
2384  goto write_inode_buffer;
2385  len = item->ri_buf[2].i_len;
2386  src = item->ri_buf[2].i_addr;
2387  ASSERT(in_f->ilf_size <= 4);
2388  ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2389  ASSERT(!(fields & XFS_ILOG_DFORK) ||
2390  (len == in_f->ilf_dsize));
2391 
2392  switch (fields & XFS_ILOG_DFORK) {
2393  case XFS_ILOG_DDATA:
2394  case XFS_ILOG_DEXT:
2395  memcpy(XFS_DFORK_DPTR(dip), src, len);
2396  break;
2397 
2398  case XFS_ILOG_DBROOT:
2399  xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2400  (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2401  XFS_DFORK_DSIZE(dip, mp));
2402  break;
2403 
2404  default:
2405  /*
2406  * There are no data fork flags set.
2407  */
2408  ASSERT((fields & XFS_ILOG_DFORK) == 0);
2409  break;
2410  }
2411 
2412  /*
2413  * If we logged any attribute data, recover it. There may or
2414  * may not have been any other non-core data logged in this
2415  * transaction.
2416  */
2417  if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2418  if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2419  attr_index = 3;
2420  } else {
2421  attr_index = 2;
2422  }
2423  len = item->ri_buf[attr_index].i_len;
2424  src = item->ri_buf[attr_index].i_addr;
2425  ASSERT(len == in_f->ilf_asize);
2426 
2427  switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2428  case XFS_ILOG_ADATA:
2429  case XFS_ILOG_AEXT:
2430  dest = XFS_DFORK_APTR(dip);
2431  ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2432  memcpy(dest, src, len);
2433  break;
2434 
2435  case XFS_ILOG_ABROOT:
2436  dest = XFS_DFORK_APTR(dip);
2437  xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2438  len, (xfs_bmdr_block_t*)dest,
2439  XFS_DFORK_ASIZE(dip, mp));
2440  break;
2441 
2442  default:
2443  xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2444  ASSERT(0);
2445  xfs_buf_relse(bp);
2446  error = EIO;
2447  goto error;
2448  }
2449  }
2450 
2451 write_inode_buffer:
2452  ASSERT(bp->b_target->bt_mount == mp);
2453  bp->b_iodone = xlog_recover_iodone;
2454  xfs_buf_delwri_queue(bp, buffer_list);
2455  xfs_buf_relse(bp);
2456 error:
2457  if (need_free)
2458  kmem_free(in_f);
2459  return XFS_ERROR(error);
2460 }
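
The di_flushiter comparison above, including the wrap case, can be isolated as the small sketch below. This is illustrative userspace code, not the kernel routine, and it assumes DI_MAX_FLUSH is the 16-bit maximum 0xffff.

#include <stdio.h>

#define DI_MAX_FLUSH	0xffff

/* return 1 when the logged copy should be skipped (disk copy is newer) */
static int skip_replay(unsigned int logged_iter, unsigned int disk_iter)
{
	if (logged_iter >= disk_iter)
		return 0;			/* logged copy is as new or newer */
	if (disk_iter == DI_MAX_FLUSH && logged_iter < (DI_MAX_FLUSH >> 1))
		return 0;			/* counter wrapped: logged copy is newer */
	return 1;
}

int main(void)
{
	printf("%d\n", skip_replay(5, 9));		/* 1: disk copy newer, skip */
	printf("%d\n", skip_replay(3, DI_MAX_FLUSH));	/* 0: wrap case, replay */
	printf("%d\n", skip_replay(9, 5));		/* 0: logged copy newer, replay */
	return 0;
}
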
2461 
2462 /*
2463  * Recover QUOTAOFF records. We simply make a note of it in the xlog
2464  * structure, so that we know not to do any dquot item or dquot buffer recovery,
2465  * of that type.
2466  */
2467 STATIC int
2468 xlog_recover_quotaoff_pass1(
2469  struct xlog *log,
2470  struct xlog_recover_item *item)
2471 {
2472  xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2473  ASSERT(qoff_f);
2474 
2475  /*
2476  * The logitem format's flag tells us if this was user quotaoff,
2477  * group/project quotaoff or both.
2478  */
2479  if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2480  log->l_quotaoffs_flag |= XFS_DQ_USER;
2481  if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2482  log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2483  if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2484  log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2485 
2486  return (0);
2487 }
2488 
2489 /*
2490  * Recover a dquot record
2491  */
2492 STATIC int
2493 xlog_recover_dquot_pass2(
2494  struct xlog *log,
2495  struct list_head *buffer_list,
2496  struct xlog_recover_item *item)
2497 {
2498  xfs_mount_t *mp = log->l_mp;
2499  xfs_buf_t *bp;
2500  struct xfs_disk_dquot *ddq, *recddq;
2501  int error;
2502  xfs_dq_logformat_t *dq_f;
2503  uint type;
2504 
2505 
2506  /*
2507  * Filesystems are required to send in quota flags at mount time.
2508  */
2509  if (mp->m_qflags == 0)
2510  return (0);
2511 
2512  recddq = item->ri_buf[1].i_addr;
2513  if (recddq == NULL) {
2514  xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2515  return XFS_ERROR(EIO);
2516  }
2517  if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2518  xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2519  item->ri_buf[1].i_len, __func__);
2520  return XFS_ERROR(EIO);
2521  }
2522 
2523  /*
2524  * This type of quotas was turned off, so ignore this record.
2525  */
2526  type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2527  ASSERT(type);
2528  if (log->l_quotaoffs_flag & type)
2529  return (0);
2530 
2531  /*
2532  * At this point we know that quota was _not_ turned off.
2533  * Since the mount flags are not indicating to us otherwise, this
2534  * must mean that quota is on, and the dquot needs to be replayed.
2535  * Remember that we may not have fully recovered the superblock yet,
2536  * so we can't do the usual trick of looking at the SB quota bits.
2537  *
2538  * The other possibility, of course, is that the quota subsystem was
2539  * removed since the last mount - ENOSYS.
2540  */
2541  dq_f = item->ri_buf[0].i_addr;
2542  ASSERT(dq_f);
2543  error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2544  "xlog_recover_dquot_pass2 (log copy)");
2545  if (error)
2546  return XFS_ERROR(EIO);
2547  ASSERT(dq_f->qlf_len == 1);
2548 
2549  error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550  XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
2551  if (error)
2552  return error;
2553 
2554  ASSERT(bp);
2555  ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2556 
2557  /*
2558  * At least the magic num portion should be on disk because this
2559  * was among a chunk of dquots created earlier, and we did some
2560  * minimal initialization then.
2561  */
2562  error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2563  "xlog_recover_dquot_pass2");
2564  if (error) {
2565  xfs_buf_relse(bp);
2566  return XFS_ERROR(EIO);
2567  }
2568 
2569  memcpy(ddq, recddq, item->ri_buf[1].i_len);
2570 
2571  ASSERT(dq_f->qlf_size == 2);
2572  ASSERT(bp->b_target->bt_mount == mp);
2573  bp->b_iodone = xlog_recover_iodone;
2574  xfs_buf_delwri_queue(bp, buffer_list);
2575  xfs_buf_relse(bp);
2576 
2577  return (0);
2578 }
2579 
2580 /*
2581  * This routine is called to create an in-core extent free intent
2582  * item from the efi format structure which was logged on disk.
2583  * It allocates an in-core efi, copies the extents from the format
2584  * structure into it, and adds the efi to the AIL with the given
2585  * LSN.
2586  */
2587 STATIC int
2588 xlog_recover_efi_pass2(
2589  struct xlog *log,
2590  struct xlog_recover_item *item,
2591  xfs_lsn_t lsn)
2592 {
2593  int error;
2594  xfs_mount_t *mp = log->l_mp;
2595  xfs_efi_log_item_t *efip;
2596  xfs_efi_log_format_t *efi_formatp;
2597 
2598  efi_formatp = item->ri_buf[0].i_addr;
2599 
2600  efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2601  if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2602  &(efip->efi_format)))) {
2603  xfs_efi_item_free(efip);
2604  return error;
2605  }
2606  atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2607 
2608  spin_lock(&log->l_ailp->xa_lock);
2609  /*
2610  * xfs_trans_ail_update() drops the AIL lock.
2611  */
2612  xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2613  return 0;
2614 }
2615 
2616 
2617 /*
2618  * This routine is called when an efd format structure is found in
2619  * a committed transaction in the log. Its purpose is to cancel
2620  * the corresponding efi if it was still in the log. To do this
2621  * it searches the AIL for the efi with an id equal to that in the
2622  * efd format structure. If we find it, we remove the efi from the
2623  * AIL and free it.
2624  */
2625 STATIC int
2626 xlog_recover_efd_pass2(
2627  struct xlog *log,
2628  struct xlog_recover_item *item)
2629 {
2630  xfs_efd_log_format_t *efd_formatp;
2631  xfs_efi_log_item_t *efip = NULL;
2632  xfs_log_item_t *lip;
2633  __uint64_t efi_id;
2634  struct xfs_ail_cursor cur;
2635  struct xfs_ail *ailp = log->l_ailp;
2636 
2637  efd_formatp = item->ri_buf[0].i_addr;
2638  ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2639  ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2640  (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2641  ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2642  efi_id = efd_formatp->efd_efi_id;
2643 
2644  /*
2645  * Search for the efi with the id in the efd format structure
2646  * in the AIL.
2647  */
2648  spin_lock(&ailp->xa_lock);
2649  lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2650  while (lip != NULL) {
2651  if (lip->li_type == XFS_LI_EFI) {
2652  efip = (xfs_efi_log_item_t *)lip;
2653  if (efip->efi_format.efi_id == efi_id) {
2654  /*
2655  * xfs_trans_ail_delete() drops the
2656  * AIL lock.
2657  */
2658  xfs_trans_ail_delete(ailp, lip,
2659  SHUTDOWN_CORRUPT_INCORE);
2660  xfs_efi_item_free(efip);
2661  spin_lock(&ailp->xa_lock);
2662  break;
2663  }
2664  }
2665  lip = xfs_trans_ail_cursor_next(ailp, &cur);
2666  }
2667  xfs_trans_ail_cursor_done(ailp, &cur);
2668  spin_unlock(&ailp->xa_lock);
2669 
2670  return 0;
2671 }
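
The EFI/EFD pairing reduces to a generic intent/done list, sketched below in plain C (illustrative only; every name is invented). An intent is queued when its log item is replayed, a later "done" record with the same id cancels it, and whatever intents survive the log walk must be finished afterwards -- which is what xlog_recover_process_efis() does for real EFIs.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct intent {
	uint64_t	id;
	struct intent	*next;
};

static void intent_add(struct intent **list, uint64_t id)
{
	struct intent *i = malloc(sizeof(*i));

	i->id = id;
	i->next = *list;
	*list = i;
}

/* A "done" record cancels the matching intent, if it is still queued. */
static void intent_done(struct intent **list, uint64_t id)
{
	struct intent **pp, *i;

	for (pp = list; (i = *pp) != NULL; pp = &i->next) {
		if (i->id == id) {
			*pp = i->next;
			free(i);
			return;
		}
	}
}

int main(void)
{
	struct intent *pending = NULL, *i;

	intent_add(&pending, 1);	/* EFI 1 seen in the log */
	intent_add(&pending, 2);	/* EFI 2 seen in the log */
	intent_done(&pending, 1);	/* EFD 1 also made it to disk */

	for (i = pending; i; i = i->next)	/* only id 2 must be finished */
		printf("finish intent %llu\n", (unsigned long long)i->id);
	return 0;
}
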
2672 
2673 /*
2674  * Free up any resources allocated by the transaction
2675  *
2676  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2677  */
2678 STATIC void
2679 xlog_recover_free_trans(
2680  struct xlog_recover *trans)
2681 {
2682  xlog_recover_item_t *item, *n;
2683  int i;
2684 
2685  list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2686  /* Free the regions in the item. */
2687  list_del(&item->ri_list);
2688  for (i = 0; i < item->ri_cnt; i++)
2689  kmem_free(item->ri_buf[i].i_addr);
2690  /* Free the item itself */
2691  kmem_free(item->ri_buf);
2692  kmem_free(item);
2693  }
2694  /* Free the transaction recover structure */
2695  kmem_free(trans);
2696 }
2697 
2698 STATIC int
2699 xlog_recover_commit_pass1(
2700  struct xlog *log,
2701  struct xlog_recover *trans,
2702  struct xlog_recover_item *item)
2703 {
2704  trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2705 
2706  switch (ITEM_TYPE(item)) {
2707  case XFS_LI_BUF:
2708  return xlog_recover_buffer_pass1(log, item);
2709  case XFS_LI_QUOTAOFF:
2710  return xlog_recover_quotaoff_pass1(log, item);
2711  case XFS_LI_INODE:
2712  case XFS_LI_EFI:
2713  case XFS_LI_EFD:
2714  case XFS_LI_DQUOT:
2715  /* nothing to do in pass 1 */
2716  return 0;
2717  default:
2718  xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2719  __func__, ITEM_TYPE(item));
2720  ASSERT(0);
2721  return XFS_ERROR(EIO);
2722  }
2723 }
2724 
2725 STATIC int
2726 xlog_recover_commit_pass2(
2727  struct xlog *log,
2728  struct xlog_recover *trans,
2729  struct list_head *buffer_list,
2730  struct xlog_recover_item *item)
2731 {
2732  trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2733 
2734  switch (ITEM_TYPE(item)) {
2735  case XFS_LI_BUF:
2736  return xlog_recover_buffer_pass2(log, buffer_list, item);
2737  case XFS_LI_INODE:
2738  return xlog_recover_inode_pass2(log, buffer_list, item);
2739  case XFS_LI_EFI:
2740  return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2741  case XFS_LI_EFD:
2742  return xlog_recover_efd_pass2(log, item);
2743  case XFS_LI_DQUOT:
2744  return xlog_recover_dquot_pass2(log, buffer_list, item);
2745  case XFS_LI_QUOTAOFF:
2746  /* nothing to do in pass2 */
2747  return 0;
2748  default:
2749  xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2750  __func__, ITEM_TYPE(item));
2751  ASSERT(0);
2752  return XFS_ERROR(EIO);
2753  }
2754 }
2755 
2756 /*
2757  * Perform the transaction.
2758  *
2759  * If the transaction modifies a buffer or inode, do it now. Otherwise,
2760  * EFIs and EFDs get queued up by adding entries into the AIL for them.
2761  */
2762 STATIC int
2763 xlog_recover_commit_trans(
2764  struct xlog *log,
2765  struct xlog_recover *trans,
2766  int pass)
2767 {
2768  int error = 0, error2;
2769  xlog_recover_item_t *item;
2770  LIST_HEAD (buffer_list);
2771 
2772  hlist_del(&trans->r_list);
2773 
2774  error = xlog_recover_reorder_trans(log, trans, pass);
2775  if (error)
2776  return error;
2777 
2778  list_for_each_entry(item, &trans->r_itemq, ri_list) {
2779  switch (pass) {
2780  case XLOG_RECOVER_PASS1:
2781  error = xlog_recover_commit_pass1(log, trans, item);
2782  break;
2783  case XLOG_RECOVER_PASS2:
2784  error = xlog_recover_commit_pass2(log, trans,
2785  &buffer_list, item);
2786  break;
2787  default:
2788  ASSERT(0);
2789  }
2790 
2791  if (error)
2792  goto out;
2793  }
2794 
2795  xlog_recover_free_trans(trans);
2796 
2797 out:
2798  error2 = xfs_buf_delwri_submit(&buffer_list);
2799  return error ? error : error2;
2800 }
2801 
2802 STATIC int
2803 xlog_recover_unmount_trans(
2804  struct xlog *log,
2805  struct xlog_recover *trans)
2806 {
2807  /* Do nothing now */
2808  xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2809  return 0;
2810 }
2811 
2812 /*
2813  * There are two valid states of the r_state field. 0 indicates that the
2814  * transaction structure is in a normal state. We have either seen the
2815  * start of the transaction or the last operation we added was not a partial
2816  * operation. If the last operation we added to the transaction was a
2817  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2818  *
2819  * NOTE: skip LRs with 0 data length.
2820  */
2821 STATIC int
2822 xlog_recover_process_data(
2823  struct xlog *log,
2824  struct hlist_head rhash[],
2825  struct xlog_rec_header *rhead,
2826  xfs_caddr_t dp,
2827  int pass)
2828 {
2829  xfs_caddr_t lp;
2830  int num_logops;
2831  xlog_op_header_t *ohead;
2832  xlog_recover_t *trans;
2833  xlog_tid_t tid;
2834  int error;
2835  unsigned long hash;
2836  uint flags;
2837 
2838  lp = dp + be32_to_cpu(rhead->h_len);
2839  num_logops = be32_to_cpu(rhead->h_num_logops);
2840 
2841  /* check the log format matches our own - else we can't recover */
2842  if (xlog_header_check_recover(log->l_mp, rhead))
2843  return (XFS_ERROR(EIO));
2844 
2845  while ((dp < lp) && num_logops) {
2846  ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2847  ohead = (xlog_op_header_t *)dp;
2848  dp += sizeof(xlog_op_header_t);
2849  if (ohead->oh_clientid != XFS_TRANSACTION &&
2850  ohead->oh_clientid != XFS_LOG) {
2851  xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2852  __func__, ohead->oh_clientid);
2853  ASSERT(0);
2854  return (XFS_ERROR(EIO));
2855  }
2856  tid = be32_to_cpu(ohead->oh_tid);
2857  hash = XLOG_RHASH(tid);
2858  trans = xlog_recover_find_tid(&rhash[hash], tid);
2859  if (trans == NULL) { /* not found; add new tid */
2860  if (ohead->oh_flags & XLOG_START_TRANS)
2861  xlog_recover_new_tid(&rhash[hash], tid,
2862  be64_to_cpu(rhead->h_lsn));
2863  } else {
2864  if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2865  xfs_warn(log->l_mp, "%s: bad length 0x%x",
2866  __func__, be32_to_cpu(ohead->oh_len));
2867  WARN_ON(1);
2868  return (XFS_ERROR(EIO));
2869  }
2870  flags = ohead->oh_flags & ~XLOG_END_TRANS;
2871  if (flags & XLOG_WAS_CONT_TRANS)
2872  flags &= ~XLOG_CONTINUE_TRANS;
2873  switch (flags) {
2874  case XLOG_COMMIT_TRANS:
2875  error = xlog_recover_commit_trans(log,
2876  trans, pass);
2877  break;
2878  case XLOG_UNMOUNT_TRANS:
2879  error = xlog_recover_unmount_trans(log, trans);
2880  break;
2881  case XLOG_WAS_CONT_TRANS:
2882  error = xlog_recover_add_to_cont_trans(log,
2883  trans, dp,
2884  be32_to_cpu(ohead->oh_len));
2885  break;
2886  case XLOG_START_TRANS:
2887  xfs_warn(log->l_mp, "%s: bad transaction",
2888  __func__);
2889  ASSERT(0);
2890  error = XFS_ERROR(EIO);
2891  break;
2892  case 0:
2893  case XLOG_CONTINUE_TRANS:
2894  error = xlog_recover_add_to_trans(log, trans,
2895  dp, be32_to_cpu(ohead->oh_len));
2896  break;
2897  default:
2898  xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2899  __func__, flags);
2900  ASSERT(0);
2901  error = XFS_ERROR(EIO);
2902  break;
2903  }
2904  if (error)
2905  return error;
2906  }
2907  dp += be32_to_cpu(ohead->oh_len);
2908  num_logops--;
2909  }
2910  return 0;
2911 }
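
The loop structure above -- walking a record body as a sequence of (op header, payload) pairs and validating each length against the end of the region -- is sketched below with an invented on-disk layout; it is not the real xlog_op_header_t format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct op_header {
	uint32_t	len;		/* payload bytes that follow */
	uint32_t	flags;		/* e.g. start/commit/continue */
};

static int walk_ops(const unsigned char *dp, size_t total, int num_ops)
{
	const unsigned char *lp = dp + total;

	while (dp < lp && num_ops) {
		struct op_header oh;

		if ((size_t)(lp - dp) < sizeof(oh))
			return -1;			/* truncated header */
		memcpy(&oh, dp, sizeof(oh));
		dp += sizeof(oh);
		if (oh.len > (size_t)(lp - dp))
			return -1;			/* bad length, as checked above */
		printf("op flags 0x%x, %u payload bytes\n", oh.flags, oh.len);
		dp += oh.len;				/* skip payload */
		num_ops--;
	}
	return 0;
}

int main(void)
{
	unsigned char buf[64] = { 0 };
	struct op_header oh = { 4, 0x2 };

	memcpy(buf, &oh, sizeof(oh));			/* one op, 4 payload bytes */
	return walk_ops(buf, sizeof(oh) + 4, 1);
}
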
2912 
2913 /*
2914  * Process an extent free intent item that was recovered from
2915  * the log. We need to free the extents that it describes.
2916  */
2917 STATIC int
2918 xlog_recover_process_efi(
2919  xfs_mount_t *mp,
2920  xfs_efi_log_item_t *efip)
2921 {
2922  xfs_efd_log_item_t *efdp;
2923  xfs_trans_t *tp;
2924  int i;
2925  int error = 0;
2926  xfs_extent_t *extp;
2927  xfs_fsblock_t startblock_fsb;
2928 
2929  ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
2930 
2931  /*
2932  * First check the validity of the extents described by the
2933  * EFI. If any are bad, then assume that all are bad and
2934  * just toss the EFI.
2935  */
2936  for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2937  extp = &(efip->efi_format.efi_extents[i]);
2938  startblock_fsb = XFS_BB_TO_FSB(mp,
2939  XFS_FSB_TO_DADDR(mp, extp->ext_start));
2940  if ((startblock_fsb == 0) ||
2941  (extp->ext_len == 0) ||
2942  (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2943  (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2944  /*
2945  * This will pull the EFI from the AIL and
2946  * free the memory associated with it.
2947  */
2948  xfs_efi_release(efip, efip->efi_format.efi_nextents);
2949  return XFS_ERROR(EIO);
2950  }
2951  }
2952 
2953  tp = xfs_trans_alloc(mp, 0);
2954  error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2955  if (error)
2956  goto abort_error;
2957  efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2958 
2959  for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2960  extp = &(efip->efi_format.efi_extents[i]);
2961  error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
2962  if (error)
2963  goto abort_error;
2964  xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
2965  extp->ext_len);
2966  }
2967 
2968  set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
2969  error = xfs_trans_commit(tp, 0);
2970  return error;
2971 
2972 abort_error:
2973  xfs_trans_cancel(tp, XFS_TRANS_ABORT);
2974  return error;
2975 }
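
A compact restatement of the extent validity test above, as a standalone sketch with made-up geometry values rather than the real superblock fields: an extent is rejected when it starts at block 0, has zero length, or runs past what the filesystem geometry allows.

#include <stdio.h>

struct geom {
	unsigned long long	dblocks;	/* filesystem size in blocks */
	unsigned long		agblocks;	/* blocks per allocation group */
};

static int extent_ok(const struct geom *g,
		     unsigned long long start, unsigned long len)
{
	if (start == 0 || len == 0)
		return 0;
	if (start >= g->dblocks || len >= g->agblocks)
		return 0;
	return 1;
}

int main(void)
{
	struct geom g = { 1000000, 250000 };

	printf("%d\n", extent_ok(&g, 4096, 16));	/* 1: plausible */
	printf("%d\n", extent_ok(&g, 0, 16));		/* 0: bad start block */
	printf("%d\n", extent_ok(&g, 4096, 300000));	/* 0: longer than an AG */
	return 0;
}
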
2976 
2977 /*
2978  * When this is called, all of the EFIs which did not have
2979  * corresponding EFDs should be in the AIL. What we do now
2980  * is free the extents associated with each one.
2981  *
2982  * Since we process the EFIs in normal transactions, they
2983  * will be removed at some point after the commit. This prevents
2984  * us from just walking down the list processing each one.
2985  * We'll use a flag in the EFI to skip those that we've already
2986  * processed and use the AIL iteration mechanism's generation
2987  * count to try to speed this up at least a bit.
2988  *
2989  * When we start, we know that the EFIs are the only things in
2990  * the AIL. As we process them, however, other items are added
2991  * to the AIL. Since everything added to the AIL must come after
2992  * everything already in the AIL, we stop processing as soon as
2993  * we see something other than an EFI in the AIL.
2994  */
2995 STATIC int
2996 xlog_recover_process_efis(
2997  struct xlog *log)
2998 {
2999  xfs_log_item_t *lip;
3000  xfs_efi_log_item_t *efip;
3001  int error = 0;
3002  struct xfs_ail_cursor cur;
3003  struct xfs_ail *ailp;
3004 
3005  ailp = log->l_ailp;
3006  spin_lock(&ailp->xa_lock);
3007  lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3008  while (lip != NULL) {
3009  /*
3010  * We're done when we see something other than an EFI.
3011  * There should be no EFIs left in the AIL now.
3012  */
3013  if (lip->li_type != XFS_LI_EFI) {
3014 #ifdef DEBUG
3015  for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3016  ASSERT(lip->li_type != XFS_LI_EFI);
3017 #endif
3018  break;
3019  }
3020 
3021  /*
3022  * Skip EFIs that we've already processed.
3023  */
3024  efip = (xfs_efi_log_item_t *)lip;
3025  if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3026  lip = xfs_trans_ail_cursor_next(ailp, &cur);
3027  continue;
3028  }
3029 
3030  spin_unlock(&ailp->xa_lock);
3031  error = xlog_recover_process_efi(log->l_mp, efip);
3032  spin_lock(&ailp->xa_lock);
3033  if (error)
3034  goto out;
3035  lip = xfs_trans_ail_cursor_next(ailp, &cur);
3036  }
3037 out:
3038  xfs_trans_ail_cursor_done(ailp, &cur);
3039  spin_unlock(&ailp->xa_lock);
3040  return error;
3041 }
3042 
3043 /*
3044  * This routine performs a transaction to null out a bad inode pointer
3045  * in an agi unlinked inode hash bucket.
3046  */
3047 STATIC void
3048 xlog_recover_clear_agi_bucket(
3049  xfs_mount_t *mp,
3050  xfs_agnumber_t agno,
3051  int bucket)
3052 {
3053  xfs_trans_t *tp;
3054  xfs_agi_t *agi;
3055  xfs_buf_t *agibp;
3056  int offset;
3057  int error;
3058 
3059  tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3060  error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3061  0, 0, 0);
3062  if (error)
3063  goto out_abort;
3064 
3065  error = xfs_read_agi(mp, tp, agno, &agibp);
3066  if (error)
3067  goto out_abort;
3068 
3069  agi = XFS_BUF_TO_AGI(agibp);
3070  agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3071  offset = offsetof(xfs_agi_t, agi_unlinked) +
3072  (sizeof(xfs_agino_t) * bucket);
3073  xfs_trans_log_buf(tp, agibp, offset,
3074  (offset + sizeof(xfs_agino_t) - 1));
3075 
3076  error = xfs_trans_commit(tp, 0);
3077  if (error)
3078  goto out_error;
3079  return;
3080 
3081 out_abort:
3083 out_error:
3084  xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3085  return;
3086 }
3087 
3088 STATIC xfs_agino_t
3089 xlog_recover_process_one_iunlink(
3090  struct xfs_mount *mp,
3091  xfs_agnumber_t agno,
3092  xfs_agino_t agino,
3093  int bucket)
3094 {
3095  struct xfs_buf *ibp;
3096  struct xfs_dinode *dip;
3097  struct xfs_inode *ip;
3098  xfs_ino_t ino;
3099  int error;
3100 
3101  ino = XFS_AGINO_TO_INO(mp, agno, agino);
3102  error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3103  if (error)
3104  goto fail;
3105 
3106  /*
3107  * Get the on disk inode to find the next inode in the bucket.
3108  */
3109  error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
3110  if (error)
3111  goto fail_iput;
3112 
3113  ASSERT(ip->i_d.di_nlink == 0);
3114  ASSERT(ip->i_d.di_mode != 0);
3115 
3116  /* setup for the next pass */
3117  agino = be32_to_cpu(dip->di_next_unlinked);
3118  xfs_buf_relse(ibp);
3119 
3120  /*
3121  * Prevent any DMAPI event from being sent when the reference on
3122  * the inode is dropped.
3123  */
3124  ip->i_d.di_dmevmask = 0;
3125 
3126  IRELE(ip);
3127  return agino;
3128 
3129  fail_iput:
3130  IRELE(ip);
3131  fail:
3132  /*
3133  * We can't read in the inode this bucket points to, or this inode
3134  * is messed up. Just ditch this bucket of inodes. We will lose
3135  * some inodes and space, but at least we won't hang.
3136  *
3137  * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3138  * clear the inode pointer in the bucket.
3139  */
3140  xlog_recover_clear_agi_bucket(mp, agno, bucket);
3141  return NULLAGINO;
3142 }
3143 
3144 /*
3145  * xlog_iunlink_recover
3146  *
3147  * This is called during recovery to process any inodes which
3148  * we unlinked but not freed when the system crashed. These
3149  * inodes will be on the lists in the AGI blocks. What we do
3150  * here is scan all the AGIs and fully truncate and free any
3151  * inodes found on the lists. Each inode is removed from the
3152  * lists when it has been fully truncated and is freed. The
3153  * freeing of the inode and its removal from the list must be
3154  * atomic.
3155  */
3156 STATIC void
3157 xlog_recover_process_iunlinks(
3158  struct xlog *log)
3159 {
3160  xfs_mount_t *mp;
3161  xfs_agnumber_t agno;
3162  xfs_agi_t *agi;
3163  xfs_buf_t *agibp;
3164  xfs_agino_t agino;
3165  int bucket;
3166  int error;
3167  uint mp_dmevmask;
3168 
3169  mp = log->l_mp;
3170 
3171  /*
3172  * Prevent any DMAPI event from being sent while in this function.
3173  */
3174  mp_dmevmask = mp->m_dmevmask;
3175  mp->m_dmevmask = 0;
3176 
3177  for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3178  /*
3179  * Find the agi for this ag.
3180  */
3181  error = xfs_read_agi(mp, NULL, agno, &agibp);
3182  if (error) {
3183  /*
3184  * AGI is b0rked. Don't process it.
3185  *
3186  * We should probably mark the filesystem as corrupt
3187  * after we've recovered all the ag's we can....
3188  */
3189  continue;
3190  }
3191  /*
3192  * Unlock the buffer so that it can be acquired in the normal
3193  * course of the transaction to truncate and free each inode.
3194  * Because we are not racing with anyone else here for the AGI
3195  * buffer, we don't even need to hold it locked to read the
3196  * initial unlinked bucket entries out of the buffer. We keep
3197  * buffer reference though, so that it stays pinned in memory
3198  * while we need the buffer.
3199  */
3200  agi = XFS_BUF_TO_AGI(agibp);
3201  xfs_buf_unlock(agibp);
3202 
3203  for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3204  agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3205  while (agino != NULLAGINO) {
3206  agino = xlog_recover_process_one_iunlink(mp,
3207  agno, agino, bucket);
3208  }
3209  }
3210  xfs_buf_rele(agibp);
3211  }
3212 
3213  mp->m_dmevmask = mp_dmevmask;
3214 }
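
The per-bucket walk above boils down to following a singly linked "next unlinked" chain until a NULL sentinel is reached, processing each inode found on the way. A toy sketch (illustrative only, not the on-disk format):

#include <stdio.h>

#define NULLINO	(~0u)

struct disk_inode {
	unsigned int	next_unlinked;	/* next inode number on the chain, or NULLINO */
};

int main(void)
{
	/* inode "numbers" index this array; the bucket head points at inode 2 */
	struct disk_inode inodes[4] = {
		{ NULLINO }, { NULLINO }, { 0 }, { NULLINO },
	};
	unsigned int bucket_head = 2;	/* chain: inode 2 -> inode 0 -> end */
	unsigned int ino;

	for (ino = bucket_head; ino != NULLINO; ino = inodes[ino].next_unlinked)
		printf("process unlinked inode %u\n", ino);
	return 0;
}
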
3215 
3216 
3217 #ifdef DEBUG
3218 STATIC void
3219 xlog_pack_data_checksum(
3220  struct xlog *log,
3221  struct xlog_in_core *iclog,
3222  int size)
3223 {
3224  int i;
3225  __be32 *up;
3226  uint chksum = 0;
3227 
3228  up = (__be32 *)iclog->ic_datap;
3229  /* divide length by 4 to get # words */
3230  for (i = 0; i < (size >> 2); i++) {
3231  chksum ^= be32_to_cpu(*up);
3232  up++;
3233  }
3234  iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235 }
3236 #else
3237 #define xlog_pack_data_checksum(log, iclog, size)
3238 #endif
3239 
3240 /*
3241  * Stamp cycle number in every block
3242  */
3243 void
3244 xlog_pack_data(
3245  struct xlog *log,
3246  struct xlog_in_core *iclog,
3247  int roundoff)
3248 {
3249  int i, j, k;
3250  int size = iclog->ic_offset + roundoff;
3251  __be32 cycle_lsn;
3252  xfs_caddr_t dp;
3253 
3254  xlog_pack_data_checksum(log, iclog, size);
3255 
3256  cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3257 
3258  dp = iclog->ic_datap;
3259  for (i = 0; i < BTOBB(size) &&
3260  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261  iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262  *(__be32 *)dp = cycle_lsn;
3263  dp += BBSIZE;
3264  }
3265 
3266  if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267  xlog_in_core_2_t *xhdr = iclog->ic_data;
3268 
3269  for ( ; i < BTOBB(size); i++) {
3270  j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271  k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272  xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273  *(__be32 *)dp = cycle_lsn;
3274  dp += BBSIZE;
3275  }
3276 
3277  for (i = 1; i < log->l_iclog_heads; i++) {
3278  xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3279  }
3280  }
3281 }
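
The cycle stamping above can be illustrated with a standalone sketch: save the first word of every 512-byte basic block, overwrite it with the record's cycle number so torn writes can be detected, and restore the saved words when unpacking. The real code stores the saved words in the record header (and in extended headers for v2 logs); this sketch simply keeps them in a side array.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BBSIZE	512

static void pack_cycles(unsigned char *data, int nblocks,
			uint32_t cycle, uint32_t *saved)
{
	int i;

	for (i = 0; i < nblocks; i++) {
		memcpy(&saved[i], data + i * BBSIZE, sizeof(uint32_t));
		memcpy(data + i * BBSIZE, &cycle, sizeof(uint32_t));
	}
}

static void unpack_cycles(unsigned char *data, int nblocks,
			  const uint32_t *saved)
{
	int i;

	for (i = 0; i < nblocks; i++)
		memcpy(data + i * BBSIZE, &saved[i], sizeof(uint32_t));
}

int main(void)
{
	unsigned char data[2 * BBSIZE];
	uint32_t saved[2];
	uint32_t word = 0xdeadbeef;

	memset(data, 0, sizeof(data));
	memcpy(data, &word, sizeof(word));	/* block 0 starts with 0xdeadbeef */

	pack_cycles(data, 2, 7, saved);		/* every block now starts with cycle 7 */
	unpack_cycles(data, 2, saved);		/* original first words restored */

	memcpy(&word, data, sizeof(word));
	printf("0x%x\n", (unsigned int)word);	/* 0xdeadbeef again */
	return 0;
}
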
3282 
3283 STATIC void
3284 xlog_unpack_data(
3285  struct xlog_rec_header *rhead,
3286  xfs_caddr_t dp,
3287  struct xlog *log)
3288 {
3289  int i, j, k;
3290 
3291  for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3293  *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3294  dp += BBSIZE;
3295  }
3296 
3297  if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3298  xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3299  for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3300  j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3301  k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3302  *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3303  dp += BBSIZE;
3304  }
3305  }
3306 }
3307 
3308 STATIC int
3309 xlog_valid_rec_header(
3310  struct xlog *log,
3311  struct xlog_rec_header *rhead,
3312  xfs_daddr_t blkno)
3313 {
3314  int hlen;
3315 
3316  if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
3317  XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3318  XFS_ERRLEVEL_LOW, log->l_mp);
3319  return XFS_ERROR(EFSCORRUPTED);
3320  }
3321  if (unlikely(
3322  (!rhead->h_version ||
3323  (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3324  xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3325  __func__, be32_to_cpu(rhead->h_version));
3326  return XFS_ERROR(EIO);
3327  }
3328 
3329  /* LR body must have data or it wouldn't have been written */
3330  hlen = be32_to_cpu(rhead->h_len);
3331  if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3332  XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3333  XFS_ERRLEVEL_LOW, log->l_mp);
3334  return XFS_ERROR(EFSCORRUPTED);
3335  }
3336  if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3337  XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3338  XFS_ERRLEVEL_LOW, log->l_mp);
3339  return XFS_ERROR(EFSCORRUPTED);
3340  }
3341  return 0;
3342 }
3343 
3344 /*
3345  * Read the log from tail to head and process the log records found.
3346  * Handle the two cases where the tail and head are in the same cycle
3347  * and where the active portion of the log wraps around the end of
3348  * the physical log separately. The pass parameter is passed through
3349  * to the routines called to process the data and is not looked at
3350  * here.
3351  */
3352 STATIC int
3353 xlog_do_recovery_pass(
3354  struct xlog *log,
3355  xfs_daddr_t head_blk,
3356  xfs_daddr_t tail_blk,
3357  int pass)
3358 {
3359  xlog_rec_header_t *rhead;
3360  xfs_daddr_t blk_no;
3361  xfs_caddr_t offset;
3362  xfs_buf_t *hbp, *dbp;
3363  int error = 0, h_size;
3364  int bblks, split_bblks;
3365  int hblks, split_hblks, wrapped_hblks;
3366  struct hlist_head rhash[XLOG_RHASH_SIZE];
3367 
3368  ASSERT(head_blk != tail_blk);
3369 
3370  /*
3371  * Read the header of the tail block and get the iclog buffer size from
3372  * h_size. Use this to tell how many sectors make up the log header.
3373  */
3374  if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3375  /*
3376  * When using variable length iclogs, read first sector of
3377  * iclog header and extract the header size from it. Get a
3378  * new hbp that is the correct size.
3379  */
3380  hbp = xlog_get_bp(log, 1);
3381  if (!hbp)
3382  return ENOMEM;
3383 
3384  error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3385  if (error)
3386  goto bread_err1;
3387 
3388  rhead = (xlog_rec_header_t *)offset;
3389  error = xlog_valid_rec_header(log, rhead, tail_blk);
3390  if (error)
3391  goto bread_err1;
3392  h_size = be32_to_cpu(rhead->h_size);
3393  if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3394  (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3395  hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3396  if (h_size % XLOG_HEADER_CYCLE_SIZE)
3397  hblks++;
3398  xlog_put_bp(hbp);
3399  hbp = xlog_get_bp(log, hblks);
3400  } else {
3401  hblks = 1;
3402  }
3403  } else {
3404  ASSERT(log->l_sectBBsize == 1);
3405  hblks = 1;
3406  hbp = xlog_get_bp(log, 1);
3407  h_size = XLOG_BIG_RECORD_BSIZE;
3408  }
3409 
3410  if (!hbp)
3411  return ENOMEM;
3412  dbp = xlog_get_bp(log, BTOBB(h_size));
3413  if (!dbp) {
3414  xlog_put_bp(hbp);
3415  return ENOMEM;
3416  }
3417 
3418  memset(rhash, 0, sizeof(rhash));
3419  if (tail_blk <= head_blk) {
3420  for (blk_no = tail_blk; blk_no < head_blk; ) {
3421  error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3422  if (error)
3423  goto bread_err2;
3424 
3425  rhead = (xlog_rec_header_t *)offset;
3426  error = xlog_valid_rec_header(log, rhead, blk_no);
3427  if (error)
3428  goto bread_err2;
3429 
3430  /* blocks in data section */
3431  bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3432  error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3433  &offset);
3434  if (error)
3435  goto bread_err2;
3436 
3437  xlog_unpack_data(rhead, offset, log);
3438  if ((error = xlog_recover_process_data(log,
3439  rhash, rhead, offset, pass)))
3440  goto bread_err2;
3441  blk_no += bblks + hblks;
3442  }
3443  } else {
3444  /*
3445  * Perform recovery around the end of the physical log.
3446  * When the head is not on the same cycle number as the tail,
3447  * we can't do a sequential recovery as above.
3448  */
3449  blk_no = tail_blk;
3450  while (blk_no < log->l_logBBsize) {
3451  /*
3452  * Check for header wrapping around physical end-of-log
3453  */
3454  offset = hbp->b_addr;
3455  split_hblks = 0;
3456  wrapped_hblks = 0;
3457  if (blk_no + hblks <= log->l_logBBsize) {
3458  /* Read header in one read */
3459  error = xlog_bread(log, blk_no, hblks, hbp,
3460  &offset);
3461  if (error)
3462  goto bread_err2;
3463  } else {
3464  /* This LR is split across physical log end */
3465  if (blk_no != log->l_logBBsize) {
3466  /* some data before physical log end */
3467  ASSERT(blk_no <= INT_MAX);
3468  split_hblks = log->l_logBBsize - (int)blk_no;
3469  ASSERT(split_hblks > 0);
3470  error = xlog_bread(log, blk_no,
3471  split_hblks, hbp,
3472  &offset);
3473  if (error)
3474  goto bread_err2;
3475  }
3476 
3477  /*
3478  * Note: this black magic still works with
3479  * large sector sizes (non-512) only because:
3480  * - we increased the buffer size originally
3481  * by 1 sector giving us enough extra space
3482  * for the second read;
3483  * - the log start is guaranteed to be sector
3484  * aligned;
3485  * - we read the log end (LR header start)
3486  * _first_, then the log start (LR header end)
3487  * - order is important.
3488  */
3489  wrapped_hblks = hblks - split_hblks;
3490  error = xlog_bread_offset(log, 0,
3491  wrapped_hblks, hbp,
3492  offset + BBTOB(split_hblks));
3493  if (error)
3494  goto bread_err2;
3495  }
3496  rhead = (xlog_rec_header_t *)offset;
3497  error = xlog_valid_rec_header(log, rhead,
3498  split_hblks ? blk_no : 0);
3499  if (error)
3500  goto bread_err2;
3501 
3502  bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3503  blk_no += hblks;
3504 
3505  /* Read in data for log record */
3506  if (blk_no + bblks <= log->l_logBBsize) {
3507  error = xlog_bread(log, blk_no, bblks, dbp,
3508  &offset);
3509  if (error)
3510  goto bread_err2;
3511  } else {
3512  /* This log record is split across the
3513  * physical end of log */
3514  offset = dbp->b_addr;
3515  split_bblks = 0;
3516  if (blk_no != log->l_logBBsize) {
3517  /* some data is before the physical
3518  * end of log */
3519  ASSERT(!wrapped_hblks);
3520  ASSERT(blk_no <= INT_MAX);
3521  split_bblks =
3522  log->l_logBBsize - (int)blk_no;
3523  ASSERT(split_bblks > 0);
3524  error = xlog_bread(log, blk_no,
3525  split_bblks, dbp,
3526  &offset);
3527  if (error)
3528  goto bread_err2;
3529  }
3530 
3531  /*
3532  * Note: this black magic still works with
3533  * large sector sizes (non-512) only because:
3534  * - we increased the buffer size originally
3535  * by 1 sector giving us enough extra space
3536  * for the second read;
3537  * - the log start is guaranteed to be sector
3538  * aligned;
3539  * - we read the log end (LR header start)
3540  * _first_, then the log start (LR header end)
3541  * - order is important.
3542  */
3543  error = xlog_bread_offset(log, 0,
3544  bblks - split_bblks, dbp,
3545  offset + BBTOB(split_bblks));
3546  if (error)
3547  goto bread_err2;
3548  }
3549  xlog_unpack_data(rhead, offset, log);
3550  if ((error = xlog_recover_process_data(log, rhash,
3551  rhead, offset, pass)))
3552  goto bread_err2;
3553  blk_no += bblks;
3554  }
3555 
3556  ASSERT(blk_no >= log->l_logBBsize);
3557  blk_no -= log->l_logBBsize;
3558 
3559  /* read first part of physical log */
3560  while (blk_no < head_blk) {
3561  error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3562  if (error)
3563  goto bread_err2;
3564 
3565  rhead = (xlog_rec_header_t *)offset;
3566  error = xlog_valid_rec_header(log, rhead, blk_no);
3567  if (error)
3568  goto bread_err2;
3569 
3570  bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3571  error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3572  &offset);
3573  if (error)
3574  goto bread_err2;
3575 
3576  xlog_unpack_data(rhead, offset, log);
3577  if ((error = xlog_recover_process_data(log, rhash,
3578  rhead, offset, pass)))
3579  goto bread_err2;
3580  blk_no += bblks + hblks;
3581  }
3582  }
3583 
3584  bread_err2:
3585  xlog_put_bp(dbp);
3586  bread_err1:
3587  xlog_put_bp(hbp);
3588  return error;
3589 }
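
The split reads used above when a record straddles the physical end of the circular log reduce to the following sketch over a pretend one-byte-per-block device (illustrative only): read the piece before the physical end first, then continue from block 0 for the remainder.

#include <stdio.h>
#include <string.h>

/* pretend "device": a circular log of log_size blocks of 1 byte each */
static void read_blocks(const char *log, int blkno, int nblocks, char *dst)
{
	memcpy(dst, log + blkno, nblocks);
}

static void read_wrapped(const char *log, int log_size,
			 int blkno, int nblocks, char *dst)
{
	if (blkno + nblocks <= log_size) {
		read_blocks(log, blkno, nblocks, dst);	/* fits in one read */
		return;
	}
	/* split: some blocks before the physical end, the rest from block 0 */
	int split = log_size - blkno;

	read_blocks(log, blkno, split, dst);
	read_blocks(log, 0, nblocks - split, dst + split);
}

int main(void)
{
	char log[8] = "ABCDEFGH";
	char buf[5] = { 0 };

	read_wrapped(log, 8, 6, 4, buf);	/* blocks 6,7,0,1 -> "GHAB" */
	printf("%s\n", buf);
	return 0;
}
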
3590 
3591 /*
3592  * Do the recovery of the log. We actually do this in two phases.
3593  * The two passes are necessary in order to implement the function
3594  * of cancelling a record written into the log. The first pass
3595  * determines those things which have been cancelled, and the
3596  * second pass replays log items normally except for those which
3597  * have been cancelled. The handling of the replay and cancellations
3598  * takes place in the log item type specific routines.
3599  *
3600  * The table of items which have cancel records in the log is allocated
3601  * and freed at this level, since only here do we know when all of
3602  * the log recovery has been completed.
3603  */
3604 STATIC int
3605 xlog_do_log_recovery(
3606  struct xlog *log,
3607  xfs_daddr_t head_blk,
3608  xfs_daddr_t tail_blk)
3609 {
3610  int error, i;
3611 
3612  ASSERT(head_blk != tail_blk);
3613 
3614  /*
3615  * First do a pass to find all of the cancelled buf log items.
3616  * Store them in the buf_cancel_table for use in the second pass.
3617  */
3618  log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3619  sizeof(struct list_head),
3620  KM_SLEEP);
3621  for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3622  INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3623 
3624  error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3625  XLOG_RECOVER_PASS1);
3626  if (error != 0) {
3627  kmem_free(log->l_buf_cancel_table);
3628  log->l_buf_cancel_table = NULL;
3629  return error;
3630  }
3631  /*
3632  * Then do a second pass to actually recover the items in the log.
3633  * When it is complete free the table of buf cancel items.
3634  */
3635  error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3636  XLOG_RECOVER_PASS2);
3637 #ifdef DEBUG
3638  if (!error) {
3639  int i;
3640 
3641  for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3642  ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3643  }
3644 #endif /* DEBUG */
3645 
3646  kmem_free(log->l_buf_cancel_table);
3647  log->l_buf_cancel_table = NULL;
3648 
3649  return error;
3650 }
3651 
3652 /*
3653  * Do the actual recovery
3654  */
3655 STATIC int
3656 xlog_do_recover(
3657  struct xlog *log,
3658  xfs_daddr_t head_blk,
3659  xfs_daddr_t tail_blk)
3660 {
3661  int error;
3662  xfs_buf_t *bp;
3663  xfs_sb_t *sbp;
3664 
3665  /*
3666  * First replay the images in the log.
3667  */
3668  error = xlog_do_log_recovery(log, head_blk, tail_blk);
3669  if (error)
3670  return error;
3671 
3672  /*
3673  * If IO errors happened during recovery, bail out.
3674  */
3675  if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3676  return (EIO);
3677  }
3678 
3679  /*
3680  * We now update the tail_lsn since much of the recovery has completed
3681  * and there may be space available to use. If there were no extent
3682  * or iunlinks, we can free up the entire log and set the tail_lsn to
3683  * be the last_sync_lsn. This was set in xlog_find_tail to be the
3684  * lsn of the last known good LR on disk. If there are extent frees
3685  * or iunlinks they will have some entries in the AIL; so we look at
3686  * the AIL to determine how to set the tail_lsn.
3687  */
3688  xlog_assign_tail_lsn(log->l_mp);
3689 
3690  /*
3691  * Now that we've finished replaying all buffer and inode
3692  * updates, re-read in the superblock.
3693  */
3694  bp = xfs_getsb(log->l_mp, 0);
3695  XFS_BUF_UNDONE(bp);
3696  ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697  XFS_BUF_READ(bp);
3698  XFS_BUF_UNASYNC(bp);
3699  xfsbdstrat(log->l_mp, bp);
3700  error = xfs_buf_iowait(bp);
3701  if (error) {
3702  xfs_buf_ioerror_alert(bp, __func__);
3703  ASSERT(0);
3704  xfs_buf_relse(bp);
3705  return error;
3706  }
3707 
3708  /* Convert superblock from on-disk format */
3709  sbp = &log->l_mp->m_sb;
3710  xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
3711  ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712  ASSERT(xfs_sb_good_version(sbp));
3713  xfs_buf_relse(bp);
3714 
3715  /* We've re-read the superblock so re-initialize per-cpu counters */
3716  xfs_icsb_reinit_counters(log->l_mp);
3717 
3718  xlog_recover_check_summary(log);
3719 
3720  /* Normal transactions can now occur */
3721  log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3722  return 0;
3723 }
3724 
3725 /*
3726  * Perform recovery and re-initialize some log variables in xlog_find_tail.
3727  *
3728  * Return error or zero.
3729  */
3730 int
3731 xlog_recover(
3732  struct xlog *log)
3733 {
3734  xfs_daddr_t head_blk, tail_blk;
3735  int error;
3736 
3737  /* find the tail of the log */
3738  if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3739  return error;
3740 
3741  if (tail_blk != head_blk) {
3742  /* There used to be a comment here:
3743  *
3744  * disallow recovery on read-only mounts. note -- mount
3745  * checks for ENOSPC and turns it into an intelligent
3746  * error message.
3747  * ...but this is no longer true. Now, unless you specify
3748  * NORECOVERY (in which case this function would never be
3749  * called), we just go ahead and recover. We do this all
3750  * under the vfs layer, so we can get away with it unless
3751  * the device itself is read-only, in which case we fail.
3752  */
3753  if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3754  return error;
3755  }
3756 
3757  xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3758  log->l_mp->m_logname ? log->l_mp->m_logname
3759  : "internal");
3760 
3761  error = xlog_do_recover(log, head_blk, tail_blk);
3762  log->l_flags |= XLOG_RECOVERY_NEEDED;
3763  }
3764  return error;
3765 }
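
For orientation, the outline below shows the shape of a mount path that uses the two entry points in this file: xlog_recover() for the first stage, then (per the comment that follows) the root and realtime-bitmap inodes are read in, and finally xlog_recover_finish() completes the work. Only the two xlog_* prototypes come from this file; the surrounding function and the inode-reading helper are hypothetical placeholders, and the snippet is an outline that would only link inside the kernel.

struct xlog;                                    /* kernel type, opaque here        */
int xlog_recover(struct xlog *log);             /* defined in this file            */
int xlog_recover_finish(struct xlog *log);      /* defined in this file            */
static int example_read_root_inodes(void);      /* hypothetical placeholder        */

static int example_mount_log_and_recover(struct xlog *log)
{
	int error;

	error = xlog_recover(log);              /* stage 1: replay buffers/inodes  */
	if (error)
		return error;

	error = example_read_root_inodes();     /* root + realtime bitmap inodes   */
	if (error)
		return error;

	return xlog_recover_finish(log);        /* stage 2: EFIs + unlinked inodes */
}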
3766 
3767 /*
3768  * In the first part of recovery we replay inodes and buffers and build
3769  * up the list of extent free items which need to be processed. Here
3770  * we process the extent free items and clean up the on disk unlinked
3771  * inode lists. This is separated from the first part of recovery so
3772  * that the root and real-time bitmap inodes can be read in from disk in
3773  * between the two stages. This is necessary so that we can free space
3774  * in the real-time portion of the file system.
3775  */
3776 int
3777 xlog_recover_finish(
3778  struct xlog *log)
3779 {
3780  /*
3781  * Now we're ready to do the transactions needed for the
3782  * rest of recovery. Start with completing all the extent
3783  * free intent records and then process the unlinked inode
3784  * lists. At this point, we essentially run in normal mode
3785  * except that we're still performing recovery actions
3786  * rather than accepting new requests.
3787  */
3788  if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3789  int error;
3790  error = xlog_recover_process_efis(log);
3791  if (error) {
3792  xfs_alert(log->l_mp, "Failed to recover EFIs");
3793  return error;
3794  }
3795  /*
3796  * Sync the log to get all the EFIs out of the AIL.
3797  * This isn't absolutely necessary, but it helps in
3798  * case the unlink transactions would have problems
3799  * pushing the EFIs out of the way.
3800  */
3801  xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3802 
3803  xlog_recover_process_iunlinks(log);
3804 
3805  xlog_recover_check_summary(log);
3806 
3807  xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3808  log->l_mp->m_logname ? log->l_mp->m_logname
3809  : "internal");
3810  log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3811  } else {
3812  xfs_info(log->l_mp, "Ending clean mount");
3813  }
3814  return 0;
3815 }
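
The EFI processing referred to in the comment inside xlog_recover_finish follows the general intent/done pattern: an intent that already has a matching "done" record was fully completed before the crash and is skipped, while the rest are completed now. The sketch below illustrates that pattern with hypothetical types and helpers; it is not the kernel's EFI/EFD machinery.

#include <stdio.h>

/* Hypothetical stand-in for an extent-free intent left over in the log. */
struct intent { int done; long start; long len; };

static void free_extent(long start, long len)   /* hypothetical helper */
{
	printf("freeing extent [%ld, +%ld]\n", start, len);
}

/*
 * Anything with a matching "done" record is skipped; the rest is completed
 * now and marked done (standing in for logging a done record).
 */
static void finish_pending_intents(struct intent *list, int n)
{
	for (int i = 0; i < n; i++) {
		if (list[i].done)
			continue;
		free_extent(list[i].start, list[i].len);
		list[i].done = 1;
	}
}

int main(void)
{
	struct intent pending[] = { { 0, 8, 16 }, { 1, 64, 8 } };

	finish_pending_intents(pending, 2);
	return 0;
}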
3816 
3817 
3818 #if defined(DEBUG)
3819 /*
3820  * Read all of the agf and agi counters and check that they
3821  * are consistent with the superblock counters.
3822  */
3823 void
3824 xlog_recover_check_summary(
3825  struct xlog *log)
3826 {
3827  xfs_mount_t *mp;
3828  xfs_agf_t *agfp;
3829  xfs_buf_t *agfbp;
3830  xfs_buf_t *agibp;
3831  xfs_agnumber_t agno;
3832  __uint64_t freeblks;
3833  __uint64_t itotal;
3834  __uint64_t ifree;
3835  int error;
3836 
3837  mp = log->l_mp;
3838 
3839  freeblks = 0LL;
3840  itotal = 0LL;
3841  ifree = 0LL;
3842  for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3843  error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3844  if (error) {
3845  xfs_alert(mp, "%s agf read failed agno %d error %d",
3846  __func__, agno, error);
3847  } else {
3848  agfp = XFS_BUF_TO_AGF(agfbp);
3849  freeblks += be32_to_cpu(agfp->agf_freeblks) +
3850  be32_to_cpu(agfp->agf_flcount);
3851  xfs_buf_relse(agfbp);
3852  }
3853 
3854  error = xfs_read_agi(mp, NULL, agno, &agibp);
3855  if (error) {
3856  xfs_alert(mp, "%s agi read failed agno %d error %d",
3857  __func__, agno, error);
3858  } else {
3859  struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3860 
3861  itotal += be32_to_cpu(agi->agi_count);
3862  ifree += be32_to_cpu(agi->agi_freecount);
3863  xfs_buf_relse(agibp);
3864  }
3865  }
3866 }
3867 #endif /* DEBUG */
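
As a worked illustration of the check the comment above describes, summing the per-AG free-block and inode counters should reproduce the superblock totals. The self-contained sketch below uses a hypothetical in-memory array of AG counters instead of the kernel's AGF/AGI buffer reads; the summation mirrors the loop in xlog_recover_check_summary, and the totals are the quantities one would compare against sb_fdblocks, sb_icount and sb_ifree.

#include <stdint.h>
#include <stdio.h>

struct example_ag {
	uint32_t freeblks;   /* like agf_freeblks */
	uint32_t flcount;    /* like agf_flcount  */
	uint32_t icount;     /* like agi_count    */
	uint32_t ifree;      /* like agi_freecount */
};

int main(void)
{
	struct example_ag ags[] = { { 100, 4, 64, 10 }, { 200, 4, 32, 2 } };
	uint64_t freeblks = 0, itotal = 0, ifree = 0;

	for (int agno = 0; agno < 2; agno++) {
		freeblks += ags[agno].freeblks + ags[agno].flcount;
		itotal   += ags[agno].icount;
		ifree    += ags[agno].ifree;
	}

	/* Totals to check against the superblock's free-block and inode counts. */
	printf("freeblks=%llu itotal=%llu ifree=%llu\n",
	       (unsigned long long)freeblks, (unsigned long long)itotal,
	       (unsigned long long)ifree);
	return 0;
}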