Linux Kernel 3.7.1
journal.c
1 /*
2  * linux/fs/jbd/journal.c
3  *
4  * Written by Stephen C. Tweedie <[email protected]>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Generic filesystem journal-writing code; part of the ext2fs
13  * journaling system.
14  *
15  * This file manages journals: areas of disk reserved for logging
16  * transactional updates. This includes the kernel journaling thread
17  * which is responsible for scheduling updates to the log.
18  *
19  * We do not actually manage the physical storage of the journal in this
20  * file: that is left to a per-journal policy function, which allows us
21  * to store the journal within a filesystem-specified area for ext2
22  * journaling (ext2 can use a reserved inode for storing the log).
23  */
24 
25 #include <linux/module.h>
26 #include <linux/time.h>
27 #include <linux/fs.h>
28 #include <linux/jbd.h>
29 #include <linux/errno.h>
30 #include <linux/slab.h>
31 #include <linux/init.h>
32 #include <linux/mm.h>
33 #include <linux/freezer.h>
34 #include <linux/pagemap.h>
35 #include <linux/kthread.h>
36 #include <linux/poison.h>
37 #include <linux/proc_fs.h>
38 #include <linux/debugfs.h>
39 #include <linux/ratelimit.h>
40 
41 #define CREATE_TRACE_POINTS
42 #include <trace/events/jbd.h>
43 
44 #include <asm/uaccess.h>
45 #include <asm/page.h>
46 
60 #if 0
61 EXPORT_SYMBOL(journal_sync_buffer);
62 #endif
64 EXPORT_SYMBOL(journal_revoke);
65 
88 
89 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
90 static void __journal_abort_soft (journal_t *journal, int errno);
91 static const char *journal_dev_name(journal_t *journal, char *buffer);
92 
93 /*
94  * Helper function used to manage commit timeouts
95  */
96 
97 static void commit_timeout(unsigned long __data)
98 {
99  struct task_struct * p = (struct task_struct *) __data;
100 
101  wake_up_process(p);
102 }
103 
104 /*
105  * kjournald: The main thread function used to manage a logging device
106  * journal.
107  *
108  * This kernel thread is responsible for two things:
109  *
110  * 1) COMMIT: Every so often we need to commit the current state of the
111  * filesystem to disk. The journal thread is responsible for writing
112  * all of the metadata buffers to disk.
113  *
114  * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
115  * of the data in that part of the log has been rewritten elsewhere on
116  * the disk. Flushing these old buffers to reclaim space in the log is
117  * known as checkpointing, and this thread is responsible for that job.
118  */
119 
120 static int kjournald(void *arg)
121 {
122  journal_t *journal = arg;
123  transaction_t *transaction;
124 
125  /*
126  * Set up an interval timer which can be used to trigger a commit wakeup
127  * after the commit interval expires
128  */
129  setup_timer(&journal->j_commit_timer, commit_timeout,
130  (unsigned long)current);
131 
132  set_freezable();
133 
134  /* Record that the journal thread is running */
135  journal->j_task = current;
136  wake_up(&journal->j_wait_done_commit);
137 
138  printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
139  journal->j_commit_interval / HZ);
140 
141  /*
142  * And now, wait forever for commit wakeup events.
143  */
144  spin_lock(&journal->j_state_lock);
145 
146 loop:
147  if (journal->j_flags & JFS_UNMOUNT)
148  goto end_loop;
149 
150  jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
151  journal->j_commit_sequence, journal->j_commit_request);
152 
153  if (journal->j_commit_sequence != journal->j_commit_request) {
154  jbd_debug(1, "OK, requests differ\n");
155  spin_unlock(&journal->j_state_lock);
156  del_timer_sync(&journal->j_commit_timer);
157  journal_commit_transaction(journal);
158  spin_lock(&journal->j_state_lock);
159  goto loop;
160  }
161 
162  wake_up(&journal->j_wait_done_commit);
163  if (freezing(current)) {
164  /*
165  * The simpler the better. Flushing journal isn't a
166  * good idea, because that depends on threads that may
167  * be already stopped.
168  */
169  jbd_debug(1, "Now suspending kjournald\n");
170  spin_unlock(&journal->j_state_lock);
171  try_to_freeze();
172  spin_lock(&journal->j_state_lock);
173  } else {
174  /*
175  * We assume on resume that commits are already there,
176  * so we don't sleep
177  */
178  DEFINE_WAIT(wait);
179  int should_sleep = 1;
180 
181  prepare_to_wait(&journal->j_wait_commit, &wait,
182  TASK_INTERRUPTIBLE);
183  if (journal->j_commit_sequence != journal->j_commit_request)
184  should_sleep = 0;
185  transaction = journal->j_running_transaction;
186  if (transaction && time_after_eq(jiffies,
187  transaction->t_expires))
188  should_sleep = 0;
189  if (journal->j_flags & JFS_UNMOUNT)
190  should_sleep = 0;
191  if (should_sleep) {
192  spin_unlock(&journal->j_state_lock);
193  schedule();
194  spin_lock(&journal->j_state_lock);
195  }
196  finish_wait(&journal->j_wait_commit, &wait);
197  }
198 
199  jbd_debug(1, "kjournald wakes\n");
200 
201  /*
202  * Were we woken up by a commit wakeup event?
203  */
204  transaction = journal->j_running_transaction;
205  if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
206  journal->j_commit_request = transaction->t_tid;
207  jbd_debug(1, "woke because of timeout\n");
208  }
209  goto loop;
210 
211 end_loop:
212  spin_unlock(&journal->j_state_lock);
213  del_timer_sync(&journal->j_commit_timer);
214  journal->j_task = NULL;
215  wake_up(&journal->j_wait_done_commit);
216  jbd_debug(1, "Journal thread exiting.\n");
217  return 0;
218 }
219 
220 static int journal_start_thread(journal_t *journal)
221 {
222  struct task_struct *t;
223 
224  t = kthread_run(kjournald, journal, "kjournald");
225  if (IS_ERR(t))
226  return PTR_ERR(t);
227 
228  wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
229  return 0;
230 }
231 
232 static void journal_kill_thread(journal_t *journal)
233 {
234  spin_lock(&journal->j_state_lock);
235  journal->j_flags |= JFS_UNMOUNT;
236 
237  while (journal->j_task) {
238  wake_up(&journal->j_wait_commit);
239  spin_unlock(&journal->j_state_lock);
240  wait_event(journal->j_wait_done_commit,
241  journal->j_task == NULL);
242  spin_lock(&journal->j_state_lock);
243  }
244  spin_unlock(&journal->j_state_lock);
245 }
246 
247 /*
248  * journal_write_metadata_buffer: write a metadata buffer to the journal.
249  *
250  * Writes a metadata buffer to a given disk block. The actual IO is not
251  * performed but a new buffer_head is constructed which labels the data
252  * to be written with the correct destination disk block.
253  *
254  * Any magic-number escaping which needs to be done will cause a
255  * copy-out here. If the buffer happens to start with the
256  * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
257  * magic number is only written to the log for descriptor blocks. In
258  * this case, we copy the data and replace the first word with 0, and we
259  * return a result code which indicates that this buffer needs to be
260  * marked as an escaped buffer in the corresponding log descriptor
261  * block. The missing word can then be restored when the block is read
262  * during recovery.
263  *
264  * If the source buffer has already been modified by a new transaction
265  * since we took the last commit snapshot, we use the frozen copy of
266  * that data for IO. If we end up using the existing buffer_head's data
267  * for the write, then we *have* to lock the buffer to prevent anyone
268  * else from using and possibly modifying it while the IO is in
269  * progress.
270  *
271  * The function returns a pointer to the buffer_heads to be used for IO.
272  *
273  * We assume that the journal has already been locked in this function.
274  *
275  * Return value:
276  * <0: Error
277  * >=0: Finished OK
278  *
279  * On success:
280  * Bit 0 set == escape performed on the data
281  * Bit 1 set == buffer copy-out performed (kfree the data after IO)
282  */
283 
284 int journal_write_metadata_buffer(transaction_t *transaction,
285  struct journal_head *jh_in,
286  struct journal_head **jh_out,
287  unsigned int blocknr)
288 {
289  int need_copy_out = 0;
290  int done_copy_out = 0;
291  int do_escape = 0;
292  char *mapped_data;
293  struct buffer_head *new_bh;
294  struct journal_head *new_jh;
295  struct page *new_page;
296  unsigned int new_offset;
297  struct buffer_head *bh_in = jh2bh(jh_in);
298  journal_t *journal = transaction->t_journal;
299 
300  /*
301  * The buffer really shouldn't be locked: only the current committing
302  * transaction is allowed to write it, so nobody else is allowed
303  * to do any IO.
304  *
305  * akpm: except if we're journalling data, and write() output is
306  * also part of a shared mapping, and another thread has
307  * decided to launch a writepage() against this buffer.
308  */
309  J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
310 
310 
311  new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
312  /* keep subsequent assertions sane */
313  new_bh->b_state = 0;
314  init_buffer(new_bh, NULL, NULL);
315  atomic_set(&new_bh->b_count, 1);
316  new_jh = journal_add_journal_head(new_bh); /* This sleeps */
317 
318  /*
319  * If a new transaction has already done a buffer copy-out, then
320  * we use that version of the data for the commit.
321  */
322  jbd_lock_bh_state(bh_in);
323 repeat:
324  if (jh_in->b_frozen_data) {
325  done_copy_out = 1;
326  new_page = virt_to_page(jh_in->b_frozen_data);
327  new_offset = offset_in_page(jh_in->b_frozen_data);
328  } else {
329  new_page = jh2bh(jh_in)->b_page;
330  new_offset = offset_in_page(jh2bh(jh_in)->b_data);
331  }
332 
333  mapped_data = kmap_atomic(new_page);
334  /*
335  * Check for escaping
336  */
337  if (*((__be32 *)(mapped_data + new_offset)) ==
338  cpu_to_be32(JFS_MAGIC_NUMBER)) {
339  need_copy_out = 1;
340  do_escape = 1;
341  }
342  kunmap_atomic(mapped_data);
343 
344  /*
345  * Do we need to do a data copy?
346  */
347  if (need_copy_out && !done_copy_out) {
348  char *tmp;
349 
350  jbd_unlock_bh_state(bh_in);
351  tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
352  jbd_lock_bh_state(bh_in);
353  if (jh_in->b_frozen_data) {
354  jbd_free(tmp, bh_in->b_size);
355  goto repeat;
356  }
357 
358  jh_in->b_frozen_data = tmp;
359  mapped_data = kmap_atomic(new_page);
360  memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
361  kunmap_atomic(mapped_data);
362 
363  new_page = virt_to_page(tmp);
364  new_offset = offset_in_page(tmp);
365  done_copy_out = 1;
366  }
367 
368  /*
369  * Did we need to escape the data? Now that we've done all the
370  * copying, we can finally do so.
371  */
372  if (do_escape) {
373  mapped_data = kmap_atomic(new_page);
374  *((unsigned int *)(mapped_data + new_offset)) = 0;
375  kunmap_atomic(mapped_data);
376  }
377 
378  set_bh_page(new_bh, new_page, new_offset);
379  new_jh->b_transaction = NULL;
380  new_bh->b_size = jh2bh(jh_in)->b_size;
381  new_bh->b_bdev = transaction->t_journal->j_dev;
382  new_bh->b_blocknr = blocknr;
383  set_buffer_mapped(new_bh);
384  set_buffer_dirty(new_bh);
385 
386  *jh_out = new_jh;
387 
388  /*
389  * The to-be-written buffer needs to get moved to the io queue,
390  * and the original buffer whose contents we are shadowing or
391  * copying is moved to the transaction's shadow queue.
392  */
393  JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
394  spin_lock(&journal->j_list_lock);
395  __journal_file_buffer(jh_in, transaction, BJ_Shadow);
396  spin_unlock(&journal->j_list_lock);
397  jbd_unlock_bh_state(bh_in);
398 
399  JBUFFER_TRACE(new_jh, "file as BJ_IO");
400  journal_file_buffer(new_jh, transaction, BJ_IO);
401 
402  return do_escape | (done_copy_out << 1);
403 }
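
The bit-encoded return value is consumed by the commit path when it builds descriptor-block tags. A minimal sketch of the decoding, paraphrasing the caller in fs/jbd/commit.c (the surrounding commit context is assumed):

	int flags;
	unsigned int tag_flag = 0;

	flags = journal_write_metadata_buffer(commit_transaction, jh,
	                                      &new_jh, blocknr);
	if (flags & 1)			/* bit 0: first word was escaped */
		tag_flag |= JFS_FLAG_ESCAPE;
	/* bit 1 (flags & 2) means a copy-out was performed; the frozen
	 * copy must be freed once the IO completes. */
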
404 
405 /*
406  * Allocation code for the journal file. Manage the space left in the
407  * journal, so that we can begin checkpointing when appropriate.
408  */
409 
410 /*
411  * __log_space_left: Return the number of free blocks left in the journal.
412  *
413  * Called with the journal already locked.
414  *
415  * Called under j_state_lock
416  */
417 
418 int __log_space_left(journal_t *journal)
419 {
420  int left = journal->j_free;
421 
422  assert_spin_locked(&journal->j_state_lock);
423 
424  /*
425  * Be pessimistic here about the number of those free blocks which
426  * might be required for log descriptor control blocks.
427  */
428 
429 #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
430 
431  left -= MIN_LOG_RESERVED_BLOCKS;
432 
433  if (left <= 0)
434  return 0;
435  left -= (left >> 3);
436  return left;
437 }
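
As a worked example of the arithmetic above (the figures are illustrative): with j_free = 1024, reserving MIN_LOG_RESERVED_BLOCKS leaves 1024 - 32 = 992 blocks, and holding back a further eighth (992 >> 3 = 124) leaves 992 - 124 = 868, so __log_space_left() would report 868 usable blocks.
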
438 
439 /*
440  * Called under j_state_lock. Returns true if a transaction commit was started.
441  */
442 int __log_start_commit(journal_t *journal, tid_t target)
443 {
444  /*
445  * The only transaction we can possibly wait upon is the
446  * currently running transaction (if it exists). Otherwise,
447  * the target tid must be an old one.
448  */
449  if (journal->j_running_transaction &&
450  journal->j_running_transaction->t_tid == target) {
451  /*
452  * We want a new commit: OK, mark the request and wakeup the
453  * commit thread. We do _not_ do the commit ourselves.
454  */
455 
456  journal->j_commit_request = target;
457  jbd_debug(1, "JBD: requesting commit %d/%d\n",
458  journal->j_commit_request,
459  journal->j_commit_sequence);
460  wake_up(&journal->j_wait_commit);
461  return 1;
462  } else if (!tid_geq(journal->j_commit_request, target))
463  /* This should never happen, but if it does, preserve
464  the evidence before kjournald goes into a loop and
465  increments j_commit_sequence beyond all recognition. */
466  WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
467  journal->j_commit_request, journal->j_commit_sequence,
468  target, journal->j_running_transaction ?
469  journal->j_running_transaction->t_tid : 0);
470  return 0;
471 }
472 
473 int log_start_commit(journal_t *journal, tid_t tid)
474 {
475  int ret;
476 
477  spin_lock(&journal->j_state_lock);
478  ret = __log_start_commit(journal, tid);
479  spin_unlock(&journal->j_state_lock);
480  return ret;
481 }
482 
483 /*
484  * Force and wait upon a commit if the calling process is not within a
485  * transaction. This is used for forcing out undo-protected data which contains
486  * bitmaps, when the fs is running out of space.
487  *
488  * We can only force the running transaction if we don't have an active handle;
489  * otherwise, we will deadlock.
490  *
491  * Returns true if a transaction was started.
492  */
493 int journal_force_commit_nested(journal_t *journal)
494 {
495  transaction_t *transaction = NULL;
496  tid_t tid;
497 
498  spin_lock(&journal->j_state_lock);
499  if (journal->j_running_transaction && !current->journal_info) {
500  transaction = journal->j_running_transaction;
501  __log_start_commit(journal, transaction->t_tid);
502  } else if (journal->j_committing_transaction)
503  transaction = journal->j_committing_transaction;
504 
505  if (!transaction) {
506  spin_unlock(&journal->j_state_lock);
507  return 0; /* Nothing to retry */
508  }
509 
510  tid = transaction->t_tid;
511  spin_unlock(&journal->j_state_lock);
512  log_wait_commit(journal, tid);
513  return 1;
514 }
515 
516 /*
517  * Start a commit of the current running transaction (if any). Returns true
518  * if a transaction is going to be committed (or is currently already
519  * committing), and fills its tid in at *ptid
520  */
521 int journal_start_commit(journal_t *journal, tid_t *ptid)
522 {
523  int ret = 0;
524 
525  spin_lock(&journal->j_state_lock);
526  if (journal->j_running_transaction) {
527  tid_t tid = journal->j_running_transaction->t_tid;
528 
529  __log_start_commit(journal, tid);
530  /* There's a running transaction and we've just made sure
531  * its commit has been scheduled. */
532  if (ptid)
533  *ptid = tid;
534  ret = 1;
535  } else if (journal->j_committing_transaction) {
536  /*
537  * If commit has been started, then we have to wait for
538  * completion of that transaction.
539  */
540  if (ptid)
541  *ptid = journal->j_committing_transaction->t_tid;
542  ret = 1;
543  }
544  spin_unlock(&journal->j_state_lock);
545  return ret;
546 }
547 
548 /*
549  * Wait for a specified commit to complete.
550  * The caller may not hold the journal lock.
551  */
552 int log_wait_commit(journal_t *journal, tid_t tid)
553 {
554  int err = 0;
555 
556 #ifdef CONFIG_JBD_DEBUG
557  spin_lock(&journal->j_state_lock);
558  if (!tid_geq(journal->j_commit_request, tid)) {
559  printk(KERN_EMERG
560  "%s: error: j_commit_request=%d, tid=%d\n",
561  __func__, journal->j_commit_request, tid);
562  }
563  spin_unlock(&journal->j_state_lock);
564 #endif
565  spin_lock(&journal->j_state_lock);
566  if (!tid_geq(journal->j_commit_waited, tid))
567  journal->j_commit_waited = tid;
568  while (tid_gt(tid, journal->j_commit_sequence)) {
569  jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
570  tid, journal->j_commit_sequence);
571  wake_up(&journal->j_wait_commit);
572  spin_unlock(&journal->j_state_lock);
573  wait_event(journal->j_wait_done_commit,
574  !tid_gt(tid, journal->j_commit_sequence));
575  spin_lock(&journal->j_state_lock);
576  }
577  spin_unlock(&journal->j_state_lock);
578 
579  if (unlikely(is_journal_aborted(journal))) {
580  printk(KERN_EMERG "journal commit I/O error\n");
581  err = -EIO;
582  }
583  return err;
584 }
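
The two halves are usually paired through the wrappers above: request a commit of the running transaction, then block until the commit thread has retired it, much as ext3's sync_fs path does. A minimal sketch, assuming a valid journal_t:

	tid_t target;
	int err = 0;

	if (journal_start_commit(journal, &target))
		err = log_wait_commit(journal, target);
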
585 
586 /*
587  * Return 1 if a given transaction has not yet sent the barrier request
588  * associated with its commit. If 0 is returned, the transaction
589  * may or may not have sent the barrier. Used to avoid sending the barrier
590  * twice in common cases.
591  */
592 int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
593 {
594  int ret = 0;
595  transaction_t *commit_trans;
596 
597  if (!(journal->j_flags & JFS_BARRIER))
598  return 0;
599  spin_lock(&journal->j_state_lock);
600  /* Transaction already committed? */
601  if (tid_geq(journal->j_commit_sequence, tid))
602  goto out;
603  /*
604  * Transaction is being committed and we already proceeded to
605  * writing commit record?
606  */
607  commit_trans = journal->j_committing_transaction;
608  if (commit_trans && commit_trans->t_tid == tid &&
609  commit_trans->t_state >= T_COMMIT_RECORD)
610  goto out;
611  ret = 1;
612 out:
613  spin_unlock(&journal->j_state_lock);
614  return ret;
615 }
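
A filesystem uses this in its fsync path to avoid a redundant cache flush: if the commit record has not yet been written, the commit's own barrier will cover the data. A hedged sketch (commit_tid is the tid the caller is about to wait on; blkdev_issue_flush() is the block-layer flush primitive):

	int needs_barrier = 1;
	int err;

	if (journal_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = 0;	/* the commit's flush will cover us */
	err = log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
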
617 
618 /*
619  * Log buffer allocation routines:
620  */
621 
622 int journal_next_log_block(journal_t *journal, unsigned int *retp)
623 {
624  unsigned int blocknr;
625 
626  spin_lock(&journal->j_state_lock);
627  J_ASSERT(journal->j_free > 1);
628 
629  blocknr = journal->j_head;
630  journal->j_head++;
631  journal->j_free--;
632  if (journal->j_head == journal->j_last)
633  journal->j_head = journal->j_first;
634  spin_unlock(&journal->j_state_lock);
635  return journal_bmap(journal, blocknr, retp);
636 }
637 
638 /*
639  * Conversion of logical to physical block numbers for the journal
640  *
641  * On external journals the journal blocks are identity-mapped, so
642  * this is a no-op. If needed, we can use j_blk_offset - everything is
643  * ready.
644  */
645 int journal_bmap(journal_t *journal, unsigned int blocknr,
646  unsigned int *retp)
647 {
648  int err = 0;
649  unsigned int ret;
650 
651  if (journal->j_inode) {
652  ret = bmap(journal->j_inode, blocknr);
653  if (ret)
654  *retp = ret;
655  else {
656  char b[BDEVNAME_SIZE];
657 
658  printk(KERN_ALERT "%s: journal block not found "
659  "at offset %u on %s\n",
660  __func__,
661  blocknr,
662  bdevname(journal->j_dev, b));
663  err = -EIO;
664  __journal_abort_soft(journal, err);
665  }
666  } else {
667  *retp = blocknr; /* +journal->j_blk_offset */
668  }
669  return err;
670 }
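
For example, on a device journal (no j_inode) journal_bmap(journal, 5, &phys) simply yields phys = 5, whereas for an inode-backed journal the same call goes through bmap() on the journal inode to find where logical block 5 actually lives on disk.
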
671 
672 /*
673  * We play buffer_head aliasing tricks to write data/metadata blocks to
674  * the journal without copying their contents, but for journal
675  * descriptor blocks we do need to generate bona fide buffers.
676  *
677  * After the caller of journal_get_descriptor_buffer() has finished modifying
678  * the buffer's contents they really should run flush_dcache_page(bh->b_page).
679  * But we don't bother doing that, so there will be coherency problems with
680  * mmaps of blockdevs which hold live JBD-controlled filesystems.
681  */
682 struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
683 {
684  struct buffer_head *bh;
685  unsigned int blocknr;
686  int err;
687 
688  err = journal_next_log_block(journal, &blocknr);
689 
690  if (err)
691  return NULL;
692 
693  bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
694  if (!bh)
695  return NULL;
696  lock_buffer(bh);
697  memset(bh->b_data, 0, journal->j_blocksize);
698  set_buffer_uptodate(bh);
699  unlock_buffer(bh);
700  BUFFER_TRACE(bh, "return this buffer");
701  return journal_add_journal_head(bh);
702 }
703 
704 /*
705  * Management for journal control blocks: functions to create and
706  * destroy journal_t structures, and to initialise and read existing
707  * journal blocks from disk. */
708 
709 /* First: create and setup a journal_t object in memory. We initialise
710  * very few fields yet: that has to wait until we have created the
711  * journal structures from scratch, or loaded them from disk. */
712 
713 static journal_t * journal_init_common (void)
714 {
715  journal_t *journal;
716  int err;
717 
718  journal = kzalloc(sizeof(*journal), GFP_KERNEL);
719  if (!journal)
720  goto fail;
721 
722  init_waitqueue_head(&journal->j_wait_transaction_locked);
723  init_waitqueue_head(&journal->j_wait_logspace);
724  init_waitqueue_head(&journal->j_wait_done_commit);
725  init_waitqueue_head(&journal->j_wait_checkpoint);
726  init_waitqueue_head(&journal->j_wait_commit);
727  init_waitqueue_head(&journal->j_wait_updates);
728  mutex_init(&journal->j_checkpoint_mutex);
729  spin_lock_init(&journal->j_revoke_lock);
730  spin_lock_init(&journal->j_list_lock);
731  spin_lock_init(&journal->j_state_lock);
732 
733  journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
734 
735  /* The journal is marked for error until we succeed with recovery! */
736  journal->j_flags = JFS_ABORT;
737 
738  /* Set up a default-sized revoke table for the new mount. */
739  err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
740  if (err) {
741  kfree(journal);
742  goto fail;
743  }
744  return journal;
745 fail:
746  return NULL;
747 }
748 
749 /* journal_init_dev and journal_init_inode:
750  *
751  * Create a journal structure and assign it some fixed set of disk blocks
752  * for the journal. We don't actually touch those disk blocks yet, but we
753  * need to set up all of the mapping information to tell the journaling
754  * system where the journal blocks are.
755  *
756  */
757 
772 journal_t * journal_init_dev(struct block_device *bdev,
773  struct block_device *fs_dev,
774  int start, int len, int blocksize)
775 {
776  journal_t *journal = journal_init_common();
777  struct buffer_head *bh;
778  int n;
779 
780  if (!journal)
781  return NULL;
782 
783  /* journal descriptor can store up to n blocks -bzzz */
784  journal->j_blocksize = blocksize;
785  n = journal->j_blocksize / sizeof(journal_block_tag_t);
786  journal->j_wbufsize = n;
787  journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
788  if (!journal->j_wbuf) {
789  printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
790  __func__);
791  goto out_err;
792  }
793  journal->j_dev = bdev;
794  journal->j_fs_dev = fs_dev;
795  journal->j_blk_offset = start;
796  journal->j_maxlen = len;
797 
798  bh = __getblk(journal->j_dev, start, journal->j_blocksize);
799  if (!bh) {
800  printk(KERN_ERR
801  "%s: Cannot get buffer for journal superblock\n",
802  __func__);
803  goto out_err;
804  }
805  journal->j_sb_buffer = bh;
806  journal->j_superblock = (journal_superblock_t *)bh->b_data;
807 
808  return journal;
809 out_err:
810  kfree(journal->j_wbuf);
811  kfree(journal);
812  return NULL;
813 }
814 
823 journal_t * journal_init_inode (struct inode *inode)
824 {
825  struct buffer_head *bh;
826  journal_t *journal = journal_init_common();
827  int err;
828  int n;
829  unsigned int blocknr;
830 
831  if (!journal)
832  return NULL;
833 
834  journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
835  journal->j_inode = inode;
836  jbd_debug(1,
837  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
838  journal, inode->i_sb->s_id, inode->i_ino,
839  (long long) inode->i_size,
840  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
841 
842  journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
843  journal->j_blocksize = inode->i_sb->s_blocksize;
844 
845  /* journal descriptor can store up to n blocks -bzzz */
846  n = journal->j_blocksize / sizeof(journal_block_tag_t);
847  journal->j_wbufsize = n;
848  journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
849  if (!journal->j_wbuf) {
850  printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
851  __func__);
852  goto out_err;
853  }
854 
855  err = journal_bmap(journal, 0, &blocknr);
856  /* If that failed, give up */
857  if (err) {
858  printk(KERN_ERR "%s: Cannot locate journal superblock\n",
859  __func__);
860  goto out_err;
861  }
862 
863  bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
864  if (!bh) {
865  printk(KERN_ERR
866  "%s: Cannot get buffer for journal superblock\n",
867  __func__);
868  goto out_err;
869  }
870  journal->j_sb_buffer = bh;
871  journal->j_superblock = (journal_superblock_t *)bh->b_data;
872 
873  return journal;
874 out_err:
875  kfree(journal->j_wbuf);
876  kfree(journal);
877  return NULL;
878 }
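
A minimal sketch of mount-time bring-up for an inode-backed journal, assuming the filesystem has already located journal_inode (ext3 keeps the log in a reserved inode); error handling is trimmed:

	journal_t *journal;

	journal = journal_init_inode(journal_inode);
	if (!journal)
		return -ENOMEM;
	if (journal_load(journal)) {	/* replay, then journal_reset() */
		journal_destroy(journal);
		return -EIO;
	}
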
879 
880 /*
881  * If the journal init or create aborts, we need to mark the journal
882  * superblock as being NULL to prevent the journal destroy from writing
883  * back a bogus superblock.
884  */
885 static void journal_fail_superblock (journal_t *journal)
886 {
887  struct buffer_head *bh = journal->j_sb_buffer;
888  brelse(bh);
889  journal->j_sb_buffer = NULL;
890 }
891 
892 /*
893  * Given a journal_t structure, initialise the various fields for
894  * startup of a new journaling session. We use this both when creating
895  * a journal, and after recovering an old journal to reset it for
896  * subsequent use.
897  */
898 
899 static int journal_reset(journal_t *journal)
900 {
901  journal_superblock_t *sb = journal->j_superblock;
902  unsigned int first, last;
903 
904  first = be32_to_cpu(sb->s_first);
905  last = be32_to_cpu(sb->s_maxlen);
906  if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
907  printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
908  first, last);
909  journal_fail_superblock(journal);
910  return -EINVAL;
911  }
912 
913  journal->j_first = first;
914  journal->j_last = last;
915 
916  journal->j_head = first;
917  journal->j_tail = first;
918  journal->j_free = last - first;
919 
920  journal->j_tail_sequence = journal->j_transaction_sequence;
921  journal->j_commit_sequence = journal->j_transaction_sequence - 1;
922  journal->j_commit_request = journal->j_commit_sequence;
923 
924  journal->j_max_transaction_buffers = journal->j_maxlen / 4;
925 
926  /*
927  * As a special case, if the on-disk copy is already marked as needing
928  * no recovery (s_start == 0), then we can safely defer the superblock
929  * update until the next commit by setting JFS_FLUSHED. This avoids
930  * attempting a write to a potential-readonly device.
931  */
932  if (sb->s_start == 0) {
933  jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
934  "(start %u, seq %d, errno %d)\n",
935  journal->j_tail, journal->j_tail_sequence,
936  journal->j_errno);
937  journal->j_flags |= JFS_FLUSHED;
938  } else {
939  /* Lock here to make assertions happy... */
940  mutex_lock(&journal->j_checkpoint_mutex);
941  /*
942  * Update log tail information. We use WRITE_FUA since new
943  * transaction will start reusing journal space and so we
944  * must make sure information about current log tail is on
945  * disk before that.
946  */
947  journal_update_sb_log_tail(journal,
948  journal->j_tail_sequence,
949  journal->j_tail,
950  WRITE_FUA);
951  mutex_unlock(&journal->j_checkpoint_mutex);
952  }
953  return journal_start_thread(journal);
954 }
955 
964 int journal_create(journal_t *journal)
965 {
966  unsigned int blocknr;
967  struct buffer_head *bh;
968  journal_superblock_t *sb;
969  int i, err;
970 
971  if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
972  printk (KERN_ERR "Journal length (%d blocks) too short.\n",
973  journal->j_maxlen);
974  journal_fail_superblock(journal);
975  return -EINVAL;
976  }
977 
978  if (journal->j_inode == NULL) {
979  /*
980  * We don't know what block to start at!
981  */
983  "%s: creation of journal on external device!\n",
984  __func__);
985  BUG();
986  }
987 
988  /* Zero out the entire journal on disk. We cannot afford to
989  have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
990  jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
991  for (i = 0; i < journal->j_maxlen; i++) {
992  err = journal_bmap(journal, i, &blocknr);
993  if (err)
994  return err;
995  bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
996  if (unlikely(!bh))
997  return -ENOMEM;
998  lock_buffer(bh);
999  memset (bh->b_data, 0, journal->j_blocksize);
1000  BUFFER_TRACE(bh, "marking dirty");
1001  mark_buffer_dirty(bh);
1002  BUFFER_TRACE(bh, "marking uptodate");
1003  set_buffer_uptodate(bh);
1004  unlock_buffer(bh);
1005  __brelse(bh);
1006  }
1007 
1008  sync_blockdev(journal->j_dev);
1009  jbd_debug(1, "JBD: journal cleared.\n");
1010 
1011  /* OK, fill in the initial static fields in the new superblock */
1012  sb = journal->j_superblock;
1013 
1014  sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
1015  sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1016 
1017  sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1018  sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1019  sb->s_first = cpu_to_be32(1);
1020 
1021  journal->j_transaction_sequence = 1;
1022 
1023  journal->j_flags &= ~JFS_ABORT;
1024  journal->j_format_version = 2;
1025 
1026  return journal_reset(journal);
1027 }
1028 
1029 static void journal_write_superblock(journal_t *journal, int write_op)
1030 {
1031  struct buffer_head *bh = journal->j_sb_buffer;
1032  int ret;
1033 
1034  trace_journal_write_superblock(journal, write_op);
1035  if (!(journal->j_flags & JFS_BARRIER))
1036  write_op &= ~(REQ_FUA | REQ_FLUSH);
1037  lock_buffer(bh);
1038  if (buffer_write_io_error(bh)) {
1039  char b[BDEVNAME_SIZE];
1040  /*
1041  * Oh, dear. A previous attempt to write the journal
1042  * superblock failed. This could happen because the
1043  * USB device was yanked out. Or it could happen to
1044  * be a transient write error and maybe the block will
1045  * be remapped. Nothing we can do but to retry the
1046  * write and hope for the best.
1047  */
1048  printk(KERN_ERR "JBD: previous I/O error detected "
1049  "for journal superblock update for %s.\n",
1050  journal_dev_name(journal, b));
1051  clear_buffer_write_io_error(bh);
1052  set_buffer_uptodate(bh);
1053  }
1054 
1055  get_bh(bh);
1056  bh->b_end_io = end_buffer_write_sync;
1057  ret = submit_bh(write_op, bh);
1058  wait_on_buffer(bh);
1059  if (buffer_write_io_error(bh)) {
1060  clear_buffer_write_io_error(bh);
1061  set_buffer_uptodate(bh);
1062  ret = -EIO;
1063  }
1064  if (ret) {
1065  char b[BDEVNAME_SIZE];
1066  printk(KERN_ERR "JBD: Error %d detected "
1067  "when updating journal superblock for %s.\n",
1068  ret, journal_dev_name(journal, b));
1069  }
1070 }
1071 
1082 void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1083  unsigned int tail_block, int write_op)
1084 {
1085  journal_superblock_t *sb = journal->j_superblock;
1086 
1087  BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1088  jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
1089  tail_block, tail_tid);
1090 
1091  sb->s_sequence = cpu_to_be32(tail_tid);
1092  sb->s_start = cpu_to_be32(tail_block);
1093 
1094  journal_write_superblock(journal, write_op);
1095 
1096  /* Log is no longer empty */
1097  spin_lock(&journal->j_state_lock);
1098  WARN_ON(!sb->s_sequence);
1099  journal->j_flags &= ~JFS_FLUSHED;
1100  spin_unlock(&journal->j_state_lock);
1101 }
1102 
1110 static void mark_journal_empty(journal_t *journal)
1111 {
1112  journal_superblock_t *sb = journal->j_superblock;
1113 
1114  BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1115  spin_lock(&journal->j_state_lock);
1116  /* Is it already empty? */
1117  if (sb->s_start == 0) {
1118  spin_unlock(&journal->j_state_lock);
1119  return;
1120  }
1121  jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
1122  journal->j_tail_sequence);
1123 
1124  sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1125  sb->s_start = cpu_to_be32(0);
1126  spin_unlock(&journal->j_state_lock);
1127 
1128  journal_write_superblock(journal, WRITE_FUA);
1129 
1130  spin_lock(&journal->j_state_lock);
1131  /* Log is empty */
1132  journal->j_flags |= JFS_FLUSHED;
1133  spin_unlock(&journal->j_state_lock);
1134 }
1135 
1143 static void journal_update_sb_errno(journal_t *journal)
1144 {
1145  journal_superblock_t *sb = journal->j_superblock;
1146 
1147  spin_lock(&journal->j_state_lock);
1148  jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
1149  journal->j_errno);
1150  sb->s_errno = cpu_to_be32(journal->j_errno);
1151  spin_unlock(&journal->j_state_lock);
1152 
1153  journal_write_superblock(journal, WRITE_SYNC);
1154 }
1155 
1156 /*
1157  * Read the superblock for a given journal, performing initial
1158  * validation of the format.
1159  */
1160 
1161 static int journal_get_superblock(journal_t *journal)
1162 {
1163  struct buffer_head *bh;
1164  journal_superblock_t *sb;
1165  int err = -EIO;
1166 
1167  bh = journal->j_sb_buffer;
1168 
1169  J_ASSERT(bh != NULL);
1170  if (!buffer_uptodate(bh)) {
1171  ll_rw_block(READ, 1, &bh);
1172  wait_on_buffer(bh);
1173  if (!buffer_uptodate(bh)) {
1174  printk (KERN_ERR
1175  "JBD: IO error reading journal superblock\n");
1176  goto out;
1177  }
1178  }
1179 
1180  sb = journal->j_superblock;
1181 
1182  err = -EINVAL;
1183 
1184  if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
1185  sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1186  printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1187  goto out;
1188  }
1189 
1190  switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1191  case JFS_SUPERBLOCK_V1:
1192  journal->j_format_version = 1;
1193  break;
1194  case JFS_SUPERBLOCK_V2:
1195  journal->j_format_version = 2;
1196  break;
1197  default:
1198  printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1199  goto out;
1200  }
1201 
1202  if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1203  journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1204  else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1205  printk (KERN_WARNING "JBD: journal file too short\n");
1206  goto out;
1207  }
1208 
1209  if (be32_to_cpu(sb->s_first) == 0 ||
1210  be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1212  "JBD: Invalid start block of journal: %u\n",
1213  be32_to_cpu(sb->s_first));
1214  goto out;
1215  }
1216 
1217  return 0;
1218 
1219 out:
1220  journal_fail_superblock(journal);
1221  return err;
1222 }
1223 
1224 /*
1225  * Load the on-disk journal superblock and read the key fields into the
1226  * journal_t.
1227  */
1228 
1229 static int load_superblock(journal_t *journal)
1230 {
1231  int err;
1232  journal_superblock_t *sb;
1233 
1234  err = journal_get_superblock(journal);
1235  if (err)
1236  return err;
1237 
1238  sb = journal->j_superblock;
1239 
1240  journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1241  journal->j_tail = be32_to_cpu(sb->s_start);
1242  journal->j_first = be32_to_cpu(sb->s_first);
1243  journal->j_last = be32_to_cpu(sb->s_maxlen);
1244  journal->j_errno = be32_to_cpu(sb->s_errno);
1245 
1246  return 0;
1247 }
1248 
1249 
1258 int journal_load(journal_t *journal)
1259 {
1260  int err;
1261  journal_superblock_t *sb;
1262 
1263  err = load_superblock(journal);
1264  if (err)
1265  return err;
1266 
1267  sb = journal->j_superblock;
1268  /* If this is a V2 superblock, then we have to check the
1269  * features flags on it. */
1270 
1271  if (journal->j_format_version >= 2) {
1272  if ((sb->s_feature_ro_compat &
1273  ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
1274  (sb->s_feature_incompat &
1275  ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
1276  printk(KERN_WARNING
1277  "JBD: Unrecognised features on journal\n");
1278  return -EINVAL;
1279  }
1280  }
1281 
1282  /* Let the recovery code check whether it needs to recover any
1283  * data from the journal. */
1284  if (journal_recover(journal))
1285  goto recovery_error;
1286 
1287  /* OK, we've finished with the dynamic journal bits:
1288  * reinitialise the dynamic contents of the superblock in memory
1289  * and reset them on disk. */
1290  if (journal_reset(journal))
1291  goto recovery_error;
1292 
1293  journal->j_flags &= ~JFS_ABORT;
1294  journal->j_flags |= JFS_LOADED;
1295  return 0;
1296 
1297 recovery_error:
1298  printk (KERN_WARNING "JBD: recovery failed\n");
1299  return -EIO;
1300 }
1301 
1310 int journal_destroy(journal_t *journal)
1311 {
1312  int err = 0;
1313 
1314 
1315  /* Wait for the commit thread to wake up and die. */
1316  journal_kill_thread(journal);
1317 
1318  /* Force a final log commit */
1319  if (journal->j_running_transaction)
1320  journal_commit_transaction(journal);
1321 
1322  /* Force any old transactions to disk */
1323 
1324  /* We cannot race with anybody but must keep assertions happy */
1325  mutex_lock(&journal->j_checkpoint_mutex);
1326  /* Totally anal locking here... */
1327  spin_lock(&journal->j_list_lock);
1328  while (journal->j_checkpoint_transactions != NULL) {
1329  spin_unlock(&journal->j_list_lock);
1330  log_do_checkpoint(journal);
1331  spin_lock(&journal->j_list_lock);
1332  }
1333 
1334  J_ASSERT(journal->j_running_transaction == NULL);
1335  J_ASSERT(journal->j_committing_transaction == NULL);
1336  J_ASSERT(journal->j_checkpoint_transactions == NULL);
1337  spin_unlock(&journal->j_list_lock);
1338 
1339  if (journal->j_sb_buffer) {
1340  if (!is_journal_aborted(journal)) {
1341  journal->j_tail_sequence =
1342  ++journal->j_transaction_sequence;
1343  mark_journal_empty(journal);
1344  } else
1345  err = -EIO;
1346  brelse(journal->j_sb_buffer);
1347  }
1348  mutex_unlock(&journal->j_checkpoint_mutex);
1349 
1350  if (journal->j_inode)
1351  iput(journal->j_inode);
1352  if (journal->j_revoke)
1353  journal_destroy_revoke(journal);
1354  kfree(journal->j_wbuf);
1355  kfree(journal);
1356 
1357  return err;
1358 }
1359 
1360 
1372 int journal_check_used_features (journal_t *journal, unsigned long compat,
1373  unsigned long ro, unsigned long incompat)
1374 {
1375  journal_superblock_t *sb;
1376 
1377  if (!compat && !ro && !incompat)
1378  return 1;
1379  if (journal->j_format_version == 1)
1380  return 0;
1381 
1382  sb = journal->j_superblock;
1383 
1384  if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1385  ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1386  ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1387  return 1;
1388 
1389  return 0;
1390 }
1391 
1403 int journal_check_available_features (journal_t *journal, unsigned long compat,
1404  unsigned long ro, unsigned long incompat)
1405 {
1406  if (!compat && !ro && !incompat)
1407  return 1;
1408 
1409  /* We can support any known requested features iff the
1410  * superblock is in version 2. Otherwise we fail to support any
1411  * extended sb features. */
1412 
1413  if (journal->j_format_version != 2)
1414  return 0;
1415 
1416  if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
1417  (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
1418  (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
1419  return 1;
1420 
1421  return 0;
1422 }
1423 
1436 int journal_set_features (journal_t *journal, unsigned long compat,
1437  unsigned long ro, unsigned long incompat)
1438 {
1439  journal_superblock_t *sb;
1440 
1441  if (journal_check_used_features(journal, compat, ro, incompat))
1442  return 1;
1443 
1444  if (!journal_check_available_features(journal, compat, ro, incompat))
1445  return 0;
1446 
1447  jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1448  compat, ro, incompat);
1449 
1450  sb = journal->j_superblock;
1451 
1452  sb->s_feature_compat |= cpu_to_be32(compat);
1453  sb->s_feature_ro_compat |= cpu_to_be32(ro);
1454  sb->s_feature_incompat |= cpu_to_be32(incompat);
1455 
1456  return 1;
1457 }
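
Callers typically probe before setting, since journal_set_features() returns 0 when a feature is unavailable (for example on a v1 superblock). A sketch using the revoke feature bit from <linux/jbd.h>:

	if (!journal_check_used_features(journal, 0, 0,
					 JFS_FEATURE_INCOMPAT_REVOKE) &&
	    !journal_set_features(journal, 0, 0,
				  JFS_FEATURE_INCOMPAT_REVOKE))
		printk(KERN_WARNING "JBD: revoke feature unavailable\n");
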
1458 
1459 
1467 int journal_update_format (journal_t *journal)
1468 {
1469  journal_superblock_t *sb;
1470  int err;
1471 
1472  err = journal_get_superblock(journal);
1473  if (err)
1474  return err;
1475 
1476  sb = journal->j_superblock;
1477 
1478  switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1479  case JFS_SUPERBLOCK_V2:
1480  return 0;
1481  case JFS_SUPERBLOCK_V1:
1482  return journal_convert_superblock_v1(journal, sb);
1483  default:
1484  break;
1485  }
1486  return -EINVAL;
1487 }
1488 
1489 static int journal_convert_superblock_v1(journal_t *journal,
1490  journal_superblock_t *sb)
1491 {
1492  int offset, blocksize;
1493  struct buffer_head *bh;
1494 
1496  "JBD: Converting superblock from version 1 to 2.\n");
1497 
1498  /* Pre-initialise new fields to zero */
1499  offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1500  blocksize = be32_to_cpu(sb->s_blocksize);
1501  memset(&sb->s_feature_compat, 0, blocksize-offset);
1502 
1503  sb->s_nr_users = cpu_to_be32(1);
1504  sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1505  journal->j_format_version = 2;
1506 
1507  bh = journal->j_sb_buffer;
1508  BUFFER_TRACE(bh, "marking dirty");
1509  mark_buffer_dirty(bh);
1510  sync_dirty_buffer(bh);
1511  return 0;
1512 }
1513 
1514 
1524 int journal_flush(journal_t *journal)
1525 {
1526  int err = 0;
1527  transaction_t *transaction = NULL;
1528 
1529  spin_lock(&journal->j_state_lock);
1530 
1531  /* Force everything buffered to the log... */
1532  if (journal->j_running_transaction) {
1533  transaction = journal->j_running_transaction;
1534  __log_start_commit(journal, transaction->t_tid);
1535  } else if (journal->j_committing_transaction)
1536  transaction = journal->j_committing_transaction;
1537 
1538  /* Wait for the log commit to complete... */
1539  if (transaction) {
1540  tid_t tid = transaction->t_tid;
1541 
1542  spin_unlock(&journal->j_state_lock);
1543  log_wait_commit(journal, tid);
1544  } else {
1545  spin_unlock(&journal->j_state_lock);
1546  }
1547 
1548  /* ...and flush everything in the log out to disk. */
1549  spin_lock(&journal->j_list_lock);
1550  while (!err && journal->j_checkpoint_transactions != NULL) {
1551  spin_unlock(&journal->j_list_lock);
1552  mutex_lock(&journal->j_checkpoint_mutex);
1553  err = log_do_checkpoint(journal);
1554  mutex_unlock(&journal->j_checkpoint_mutex);
1555  spin_lock(&journal->j_list_lock);
1556  }
1557  spin_unlock(&journal->j_list_lock);
1558 
1559  if (is_journal_aborted(journal))
1560  return -EIO;
1561 
1562  mutex_lock(&journal->j_checkpoint_mutex);
1563  cleanup_journal_tail(journal);
1564 
1565  /* Finally, mark the journal as really needing no recovery.
1566  * This sets s_start==0 in the underlying superblock, which is
1567  * the magic code for a fully-recovered superblock. Any future
1568  * commits of data to the journal will restore the current
1569  * s_start value. */
1570  mark_journal_empty(journal);
1571  mutex_unlock(&journal->j_checkpoint_mutex);
1572  spin_lock(&journal->j_state_lock);
1573  J_ASSERT(!journal->j_running_transaction);
1574  J_ASSERT(!journal->j_committing_transaction);
1575  J_ASSERT(!journal->j_checkpoint_transactions);
1576  J_ASSERT(journal->j_head == journal->j_tail);
1577  J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1578  spin_unlock(&journal->j_state_lock);
1579  return 0;
1580 }
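
The classic caller quiesces a live filesystem around a freeze or read-only remount: block new handles, flush and checkpoint everything, do the work, then resume. A sketch of that pattern (journal_lock_updates()/journal_unlock_updates() live in fs/jbd/transaction.c):

	int err;

	journal_lock_updates(journal);	/* barrier against new handles */
	err = journal_flush(journal);	/* commit, checkpoint, mark empty */
	if (!err) {
		/* ... snapshot / remount read-only ... */
	}
	journal_unlock_updates(journal);
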
1581 
1595 int journal_wipe(journal_t *journal, int write)
1596 {
1597  int err = 0;
1598 
1599  J_ASSERT (!(journal->j_flags & JFS_LOADED));
1600 
1601  err = load_superblock(journal);
1602  if (err)
1603  return err;
1604 
1605  if (!journal->j_tail)
1606  goto no_recovery;
1607 
1608  printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1609  write ? "Clearing" : "Ignoring");
1610 
1611  err = journal_skip_recovery(journal);
1612  if (write) {
1613  /* Lock to make assertions happy... */
1614  mutex_lock(&journal->j_checkpoint_mutex);
1615  mark_journal_empty(journal);
1616  mutex_unlock(&journal->j_checkpoint_mutex);
1617  }
1618 
1619  no_recovery:
1620  return err;
1621 }
1622 
1623 /*
1624  * journal_dev_name: format a character string to describe on what
1625  * device this journal is present.
1626  */
1627 
1628 static const char *journal_dev_name(journal_t *journal, char *buffer)
1629 {
1630  struct block_device *bdev;
1631 
1632  if (journal->j_inode)
1633  bdev = journal->j_inode->i_sb->s_bdev;
1634  else
1635  bdev = journal->j_dev;
1636 
1637  return bdevname(bdev, buffer);
1638 }
1639 
1640 /*
1641  * Journal abort has very specific semantics, which we describe
1642  * below.
1643  *
1644  * Two internal functions, which provide abort to the jbd layer
1645  * itself, are here.
1646  */
1647 
1648 /*
1649  * Quick version for internal journal use (doesn't lock the journal).
1650  * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1651  * and don't attempt to make any other journal updates.
1652  */
1653 static void __journal_abort_hard(journal_t *journal)
1654 {
1655  transaction_t *transaction;
1656  char b[BDEVNAME_SIZE];
1657 
1658  if (journal->j_flags & JFS_ABORT)
1659  return;
1660 
1661  printk(KERN_ERR "Aborting journal on device %s.\n",
1662  journal_dev_name(journal, b));
1663 
1664  spin_lock(&journal->j_state_lock);
1665  journal->j_flags |= JFS_ABORT;
1666  transaction = journal->j_running_transaction;
1667  if (transaction)
1668  __log_start_commit(journal, transaction->t_tid);
1669  spin_unlock(&journal->j_state_lock);
1670 }
1671 
1672 /* Soft abort: record the abort error status in the journal superblock,
1673  * but don't do any other IO. */
1674 static void __journal_abort_soft (journal_t *journal, int errno)
1675 {
1676  if (journal->j_flags & JFS_ABORT)
1677  return;
1678 
1679  if (!journal->j_errno)
1680  journal->j_errno = errno;
1681 
1682  __journal_abort_hard(journal);
1683 
1684  if (errno)
1685  journal_update_sb_errno(journal);
1686 }
1687 
1734 void journal_abort(journal_t *journal, int errno)
1735 {
1736  __journal_abort_soft(journal, errno);
1737 }
1738 
1750 int journal_errno(journal_t *journal)
1751 {
1752  int err;
1753 
1754  spin_lock(&journal->j_state_lock);
1755  if (journal->j_flags & JFS_ABORT)
1756  err = -EROFS;
1757  else
1758  err = journal->j_errno;
1759  spin_unlock(&journal->j_state_lock);
1760  return err;
1761 }
1762 
1770 int journal_clear_err(journal_t *journal)
1771 {
1772  int err = 0;
1773 
1774  spin_lock(&journal->j_state_lock);
1775  if (journal->j_flags & JFS_ABORT)
1776  err = -EROFS;
1777  else
1778  journal->j_errno = 0;
1779  spin_unlock(&journal->j_state_lock);
1780  return err;
1781 }
1782 
1790 void journal_ack_err(journal_t *journal)
1791 {
1792  spin_lock(&journal->j_state_lock);
1793  if (journal->j_errno)
1794  journal->j_flags |= JFS_ACK_ERR;
1795  spin_unlock(&journal->j_state_lock);
1796 }
1797 
1798 int journal_blocks_per_page(struct inode *inode)
1799 {
1800  return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1801 }
1802 
1803 /*
1804  * Journal_head storage management
1805  */
1806 static struct kmem_cache *journal_head_cache;
1807 #ifdef CONFIG_JBD_DEBUG
1808 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1809 #endif
1810 
1811 static int journal_init_journal_head_cache(void)
1812 {
1813  int retval;
1814 
1815  J_ASSERT(journal_head_cache == NULL);
1816  journal_head_cache = kmem_cache_create("journal_head",
1817  sizeof(struct journal_head),
1818  0, /* offset */
1819  SLAB_TEMPORARY, /* flags */
1820  NULL); /* ctor */
1821  retval = 0;
1822  if (!journal_head_cache) {
1823  retval = -ENOMEM;
1824  printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1825  }
1826  return retval;
1827 }
1828 
1829 static void journal_destroy_journal_head_cache(void)
1830 {
1831  if (journal_head_cache) {
1832  kmem_cache_destroy(journal_head_cache);
1833  journal_head_cache = NULL;
1834  }
1835 }
1836 
1837 /*
1838  * journal_head splicing and dicing
1839  */
1840 static struct journal_head *journal_alloc_journal_head(void)
1841 {
1842  struct journal_head *ret;
1843 
1844 #ifdef CONFIG_JBD_DEBUG
1845  atomic_inc(&nr_journal_heads);
1846 #endif
1847  ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1848  if (ret == NULL) {
1849  jbd_debug(1, "out of memory for journal_head\n");
1850  printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1851  __func__);
1852 
1853  while (ret == NULL) {
1854  yield();
1855  ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1856  }
1857  }
1858  return ret;
1859 }
1860 
1861 static void journal_free_journal_head(struct journal_head *jh)
1862 {
1863 #ifdef CONFIG_JBD_DEBUG
1864  atomic_dec(&nr_journal_heads);
1865  memset(jh, JBD_POISON_FREE, sizeof(*jh));
1866 #endif
1867  kmem_cache_free(journal_head_cache, jh);
1868 }
1869 
1870 /*
1871  * A journal_head is attached to a buffer_head whenever JBD has an
1872  * interest in the buffer.
1873  *
1874  * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1875  * is set. This bit is tested in core kernel code where we need to take
1876  * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1877  * there.
1878  *
1879  * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1880  *
1881  * When a buffer has its BH_JBD bit set it is immune from being released by
1882  * core kernel code, mainly via ->b_count.
1883  *
1884  * A journal_head is detached from its buffer_head when the journal_head's
1885  * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
1886  * transaction (b_cp_transaction) hold their references to b_jcount.
1887  *
1888  * Various places in the kernel want to attach a journal_head to a buffer_head
1889  * _before_ attaching the journal_head to a transaction. To protect the
1890  * journal_head in this situation, journal_add_journal_head elevates the
1891  * journal_head's b_jcount refcount by one. The caller must call
1892  * journal_put_journal_head() to undo this.
1893  *
1894  * So the typical usage would be:
1895  *
1896  * (Attach a journal_head if needed. Increments b_jcount)
1897  * struct journal_head *jh = journal_add_journal_head(bh);
1898  * ...
1899  * (Get another reference for transaction)
1900  * journal_grab_journal_head(bh);
1901  * jh->b_transaction = xxx;
1902  * (Put original reference)
1903  * journal_put_journal_head(jh);
1904  */
1905 
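
Made concrete, the pattern described above looks like the following sketch (bh is a valid, mapped buffer_head; transaction is whichever transaction claims the buffer):

	struct journal_head *jh = journal_add_journal_head(bh);

	journal_grab_journal_head(bh);	/* reference held by the transaction */
	jh->b_transaction = transaction;
	journal_put_journal_head(jh);	/* drop the setup reference */
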
1906 /*
1907  * Give a buffer_head a journal_head.
1908  *
1909  * May sleep.
1910  */
1911 struct journal_head *journal_add_journal_head(struct buffer_head *bh)
1912 {
1913  struct journal_head *jh;
1914  struct journal_head *new_jh = NULL;
1915 
1916 repeat:
1917  if (!buffer_jbd(bh)) {
1918  new_jh = journal_alloc_journal_head();
1919  memset(new_jh, 0, sizeof(*new_jh));
1920  }
1921 
1922  jbd_lock_bh_journal_head(bh);
1923  if (buffer_jbd(bh)) {
1924  jh = bh2jh(bh);
1925  } else {
1926  J_ASSERT_BH(bh,
1927  (atomic_read(&bh->b_count) > 0) ||
1928  (bh->b_page && bh->b_page->mapping));
1929 
1930  if (!new_jh) {
1931  jbd_unlock_bh_journal_head(bh);
1932  goto repeat;
1933  }
1934 
1935  jh = new_jh;
1936  new_jh = NULL; /* We consumed it */
1937  set_buffer_jbd(bh);
1938  bh->b_private = jh;
1939  jh->b_bh = bh;
1940  get_bh(bh);
1941  BUFFER_TRACE(bh, "added journal_head");
1942  }
1943  jh->b_jcount++;
1944  jbd_unlock_bh_journal_head(bh);
1945  if (new_jh)
1946  journal_free_journal_head(new_jh);
1947  return bh->b_private;
1948 }
1949 
1950 /*
1951  * Grab a ref against this buffer_head's journal_head. If it ended up not
1952  * having a journal_head, return NULL
1953  */
1954 struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
1955 {
1956  struct journal_head *jh = NULL;
1957 
1958  jbd_lock_bh_journal_head(bh);
1959  if (buffer_jbd(bh)) {
1960  jh = bh2jh(bh);
1961  jh->b_jcount++;
1962  }
1963  jbd_unlock_bh_journal_head(bh);
1964  return jh;
1965 }
1966 
1967 static void __journal_remove_journal_head(struct buffer_head *bh)
1968 {
1969  struct journal_head *jh = bh2jh(bh);
1970 
1971  J_ASSERT_JH(jh, jh->b_jcount >= 0);
1972  J_ASSERT_JH(jh, jh->b_transaction == NULL);
1973  J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1974  J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
1975  J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1976  J_ASSERT_BH(bh, buffer_jbd(bh));
1977  J_ASSERT_BH(bh, jh2bh(jh) == bh);
1978  BUFFER_TRACE(bh, "remove journal_head");
1979  if (jh->b_frozen_data) {
1980  printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
1981  jbd_free(jh->b_frozen_data, bh->b_size);
1982  }
1983  if (jh->b_committed_data) {
1984  printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
1985  jbd_free(jh->b_committed_data, bh->b_size);
1986  }
1987  bh->b_private = NULL;
1988  jh->b_bh = NULL; /* debug, really */
1989  clear_buffer_jbd(bh);
1990  journal_free_journal_head(jh);
1991 }
1992 
1993 /*
1994  * Drop a reference on the passed journal_head. If it fell to zero then
1995  * release the journal_head from the buffer_head.
1996  */
1997 void journal_put_journal_head(struct journal_head *jh)
1998 {
1999  struct buffer_head *bh = jh2bh(jh);
2000 
2001  jbd_lock_bh_journal_head(bh);
2002  J_ASSERT_JH(jh, jh->b_jcount > 0);
2003  --jh->b_jcount;
2004  if (!jh->b_jcount) {
2005  __journal_remove_journal_head(bh);
2006  jbd_unlock_bh_journal_head(bh);
2007  __brelse(bh);
2008  } else
2009  jbd_unlock_bh_journal_head(bh);
2010 }
2011 
2012 /*
2013  * debugfs tunables
2014  */
2015 #ifdef CONFIG_JBD_DEBUG
2016 
2017 u8 journal_enable_debug __read_mostly;
2018 EXPORT_SYMBOL(journal_enable_debug);
2019 
2020 static struct dentry *jbd_debugfs_dir;
2021 static struct dentry *jbd_debug;
2022 
2023 static void __init jbd_create_debugfs_entry(void)
2024 {
2025  jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
2026  if (jbd_debugfs_dir)
2027  jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
2028  jbd_debugfs_dir,
2029  &journal_enable_debug);
2030 }
2031 
2032 static void __exit jbd_remove_debugfs_entry(void)
2033 {
2034  debugfs_remove(jbd_debug);
2035  debugfs_remove(jbd_debugfs_dir);
2036 }
2037 
2038 #else
2039 
2040 static inline void jbd_create_debugfs_entry(void)
2041 {
2042 }
2043 
2044 static inline void jbd_remove_debugfs_entry(void)
2045 {
2046 }
2047 
2048 #endif
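
With CONFIG_JBD_DEBUG enabled, the debug level is driven from userspace through this u8 file, e.g. by writing a value such as 1 to /sys/kernel/debug/jbd/jbd-debug (assuming debugfs is mounted at the usual /sys/kernel/debug); jbd_debug() statements at or below that level then appear in the kernel log.
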
2049 
2050 struct kmem_cache *jbd_handle_cache;
2051 
2052 static int __init journal_init_handle_cache(void)
2053 {
2054  jbd_handle_cache = kmem_cache_create("journal_handle",
2055  sizeof(handle_t),
2056  0, /* offset */
2057  SLAB_TEMPORARY, /* flags */
2058  NULL); /* ctor */
2059  if (jbd_handle_cache == NULL) {
2060  printk(KERN_EMERG "JBD: failed to create handle cache\n");
2061  return -ENOMEM;
2062  }
2063  return 0;
2064 }
2065 
2066 static void journal_destroy_handle_cache(void)
2067 {
2068  if (jbd_handle_cache)
2069  kmem_cache_destroy(jbd_handle_cache);
2070 }
2071 
2072 /*
2073  * Module startup and shutdown
2074  */
2075 
2076 static int __init journal_init_caches(void)
2077 {
2078  int ret;
2079 
2080  ret = journal_init_revoke_caches();
2081  if (ret == 0)
2082  ret = journal_init_journal_head_cache();
2083  if (ret == 0)
2084  ret = journal_init_handle_cache();
2085  return ret;
2086 }
2087 
2088 static void journal_destroy_caches(void)
2089 {
2090  journal_destroy_revoke_caches();
2091  journal_destroy_journal_head_cache();
2092  journal_destroy_handle_cache();
2093 }
2094 
2095 static int __init journal_init(void)
2096 {
2097  int ret;
2098 
2099  BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2100 
2101  ret = journal_init_caches();
2102  if (ret != 0)
2103  journal_destroy_caches();
2104  jbd_create_debugfs_entry();
2105  return ret;
2106 }
2107 
2108 static void __exit journal_exit(void)
2109 {
2110 #ifdef CONFIG_JBD_DEBUG
2111  int n = atomic_read(&nr_journal_heads);
2112  if (n)
2113  printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2114 #endif
2115  jbd_remove_debugfs_entry();
2116  journal_destroy_caches();
2117 }
2118 
2119 MODULE_LICENSE("GPL");
2120 module_init(journal_init);
2121 module_exit(journal_exit);
2122