Linux Kernel  3.7.1
transaction.c
1 /*
2  * linux/fs/jbd/transaction.c
3  *
4  * Written by Stephen C. Tweedie <[email protected]>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Generic filesystem transaction handling code; part of the ext2fs
13  * journaling system.
14  *
15  * This file manages transactions (compound commits managed by the
16  * journaling code) and handles (individual atomic operations by the
17  * filesystem).
18  */
19 
20 #include <linux/time.h>
21 #include <linux/fs.h>
22 #include <linux/jbd.h>
23 #include <linux/errno.h>
24 #include <linux/slab.h>
25 #include <linux/timer.h>
26 #include <linux/mm.h>
27 #include <linux/highmem.h>
28 #include <linux/hrtimer.h>
29 #include <linux/backing-dev.h>
30 
31 static void __journal_temp_unlink_buffer(struct journal_head *jh);
32 
33 /*
34  * get_transaction: obtain a new transaction_t object.
35  *
36  * Simply allocate and initialise a new transaction. Create it in
37  * RUNNING state and add it to the current journal (which should not
38  * have an existing running transaction: we only make a new transaction
39  * once we have started to commit the old one).
40  *
41  * Preconditions:
42  * The journal MUST be locked. We don't perform atomic mallocs on the
43  * new transaction and we can't block without protecting against other
44  * processes trying to touch the journal while it is in transition.
45  *
46  * Called under j_state_lock
47  */
48 
49 static transaction_t *
50 get_transaction(journal_t *journal, transaction_t *transaction)
51 {
52  transaction->t_journal = journal;
53  transaction->t_state = T_RUNNING;
54  transaction->t_start_time = ktime_get();
55  transaction->t_tid = journal->j_transaction_sequence++;
56  transaction->t_expires = jiffies + journal->j_commit_interval;
57  spin_lock_init(&transaction->t_handle_lock);
58 
59  /* Set up the commit timer for the new transaction. */
60  journal->j_commit_timer.expires =
61  round_jiffies_up(transaction->t_expires);
62  add_timer(&journal->j_commit_timer);
63 
64  J_ASSERT(journal->j_running_transaction == NULL);
65  journal->j_running_transaction = transaction;
66 
67  return transaction;
68 }
69 
70 /*
71  * Handle management.
72  *
73  * A handle_t is an object which represents a single atomic update to a
74  * filesystem, and which tracks all of the modifications which form part
75  * of that one update.
76  */
77 
78 /*
79  * start_this_handle: Given a handle, deal with any locking or stalling
80  * needed to make sure that there is enough journal space for the handle
81  * to begin. Attach the handle to a transaction and set up the
82  * transaction's buffer credits.
83  */
84 
85 static int start_this_handle(journal_t *journal, handle_t *handle)
86 {
87  transaction_t *transaction;
88  int needed;
89  int nblocks = handle->h_buffer_credits;
90  transaction_t *new_transaction = NULL;
91  int ret = 0;
92 
93  if (nblocks > journal->j_max_transaction_buffers) {
94  printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
95  current->comm, nblocks,
96  journal->j_max_transaction_buffers);
97  ret = -ENOSPC;
98  goto out;
99  }
100 
101 alloc_transaction:
102  if (!journal->j_running_transaction) {
103  new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
104  if (!new_transaction) {
105  congestion_wait(BLK_RW_ASYNC, HZ/50);
106  goto alloc_transaction;
107  }
108  }
109 
110  jbd_debug(3, "New handle %p going live.\n", handle);
111 
112 repeat:
113 
114  /*
115  * We need to hold j_state_lock until t_updates has been incremented,
116  * for proper journal barrier handling
117  */
118  spin_lock(&journal->j_state_lock);
119 repeat_locked:
120  if (is_journal_aborted(journal) ||
121  (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
122  spin_unlock(&journal->j_state_lock);
123  ret = -EROFS;
124  goto out;
125  }
126 
127  /* Wait on the journal's transaction barrier if necessary */
128  if (journal->j_barrier_count) {
129  spin_unlock(&journal->j_state_lock);
130  wait_event(journal->j_wait_transaction_locked,
131  journal->j_barrier_count == 0);
132  goto repeat;
133  }
134 
135  if (!journal->j_running_transaction) {
136  if (!new_transaction) {
137  spin_unlock(&journal->j_state_lock);
138  goto alloc_transaction;
139  }
140  get_transaction(journal, new_transaction);
141  new_transaction = NULL;
142  }
143 
144  transaction = journal->j_running_transaction;
145 
146  /*
147  * If the current transaction is locked down for commit, wait for the
148  * lock to be released.
149  */
150  if (transaction->t_state == T_LOCKED) {
151  DEFINE_WAIT(wait);
152 
153  prepare_to_wait(&journal->j_wait_transaction_locked,
154  &wait, TASK_UNINTERRUPTIBLE);
155  spin_unlock(&journal->j_state_lock);
156  schedule();
157  finish_wait(&journal->j_wait_transaction_locked, &wait);
158  goto repeat;
159  }
160 
161  /*
162  * If there is not enough space left in the log to write all potential
163  * buffers requested by this operation, we need to stall pending a log
164  * checkpoint to free some more log space.
165  */
166  spin_lock(&transaction->t_handle_lock);
167  needed = transaction->t_outstanding_credits + nblocks;
168 
169  if (needed > journal->j_max_transaction_buffers) {
170  /*
171  * If the current transaction is already too large, then start
172  * to commit it: we can then go back and attach this handle to
173  * a new transaction.
174  */
175  DEFINE_WAIT(wait);
176 
177  jbd_debug(2, "Handle %p starting new commit...\n", handle);
178  spin_unlock(&transaction->t_handle_lock);
179  prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
180  TASK_UNINTERRUPTIBLE);
181  __log_start_commit(journal, transaction->t_tid);
182  spin_unlock(&journal->j_state_lock);
183  schedule();
184  finish_wait(&journal->j_wait_transaction_locked, &wait);
185  goto repeat;
186  }
187 
188  /*
189  * The commit code assumes that it can get enough log space
190  * without forcing a checkpoint. This is *critical* for
191  * correctness: a checkpoint of a buffer which is also
192  * associated with a committing transaction creates a deadlock,
193  * so commit simply cannot force through checkpoints.
194  *
195  * We must therefore ensure the necessary space in the journal
196  * *before* starting to dirty potentially checkpointed buffers
197  * in the new transaction.
198  *
199  * The worst part is, any transaction currently committing can
200  * reduce the free space arbitrarily. Be careful to account for
201  * those buffers when checkpointing.
202  */
203 
204  /*
205  * @@@ AKPM: This seems rather over-defensive. We're giving commit
206  * a _lot_ of headroom: 1/4 of the journal plus the size of
207  * the committing transaction. Really, we only need to give it
208  * committing_transaction->t_outstanding_credits plus "enough" for
209  * the log control blocks.
210  * Also, this test is inconsistent with the matching one in
211  * journal_extend().
212  */
213  if (__log_space_left(journal) < jbd_space_needed(journal)) {
214  jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
215  spin_unlock(&transaction->t_handle_lock);
216  __log_wait_for_space(journal);
217  goto repeat_locked;
218  }
219 
220  /* OK, account for the buffers that this operation expects to
221  * use and add the handle to the running transaction. */
222 
223  handle->h_transaction = transaction;
224  transaction->t_outstanding_credits += nblocks;
225  transaction->t_updates++;
226  transaction->t_handle_count++;
227  jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
228  handle, nblocks, transaction->t_outstanding_credits,
229  __log_space_left(journal));
230  spin_unlock(&transaction->t_handle_lock);
231  spin_unlock(&journal->j_state_lock);
232 
233  lock_map_acquire(&handle->h_lockdep_map);
234 out:
235  if (unlikely(new_transaction)) /* It's usually NULL */
236  kfree(new_transaction);
237  return ret;
238 }
239 
240 static struct lock_class_key jbd_handle_key;
241 
242 /* Allocate a new handle. This should probably be in a slab... */
243 static handle_t *new_handle(int nblocks)
244 {
245  handle_t *handle = jbd_alloc_handle(GFP_NOFS);
246  if (!handle)
247  return NULL;
248  memset(handle, 0, sizeof(*handle));
249  handle->h_buffer_credits = nblocks;
250  handle->h_ref = 1;
251 
252  lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
253 
254  return handle;
255 }
256 
272 handle_t *journal_start(journal_t *journal, int nblocks)
273 {
274  handle_t *handle = journal_current_handle();
275  int err;
276 
277  if (!journal)
278  return ERR_PTR(-EROFS);
279 
280  if (handle) {
281  J_ASSERT(handle->h_transaction->t_journal == journal);
282  handle->h_ref++;
283  return handle;
284  }
285 
286  handle = new_handle(nblocks);
287  if (!handle)
288  return ERR_PTR(-ENOMEM);
289 
290  current->journal_info = handle;
291 
292  err = start_this_handle(journal, handle);
293  if (err < 0) {
294  jbd_free_handle(handle);
295  current->journal_info = NULL;
296  handle = ERR_PTR(err);
297  }
298  return handle;
299 }
300 
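301 /*
302  * journal_extend: try to give handle @handle an extra @nblocks of buffer
303  * credits without forcing a new transaction. Returns 0 on success, a
304  * positive value if the extension was refused (transaction no longer
305  * running, too large, or not enough log space), or -EIO if the handle has
306  * already been aborted. On refusal the caller should use journal_restart().
307  */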
321 int journal_extend(handle_t *handle, int nblocks)
322 {
323  transaction_t *transaction = handle->h_transaction;
324  journal_t *journal = transaction->t_journal;
325  int result;
326  int wanted;
327 
328  result = -EIO;
329  if (is_handle_aborted(handle))
330  goto out;
331 
332  result = 1;
333 
334  spin_lock(&journal->j_state_lock);
335 
336  /* Don't extend a locked-down transaction! */
337  if (handle->h_transaction->t_state != T_RUNNING) {
338  jbd_debug(3, "denied handle %p %d blocks: "
339  "transaction not running\n", handle, nblocks);
340  goto error_out;
341  }
342 
343  spin_lock(&transaction->t_handle_lock);
344  wanted = transaction->t_outstanding_credits + nblocks;
345 
346  if (wanted > journal->j_max_transaction_buffers) {
347  jbd_debug(3, "denied handle %p %d blocks: "
348  "transaction too large\n", handle, nblocks);
349  goto unlock;
350  }
351 
352  if (wanted > __log_space_left(journal)) {
353  jbd_debug(3, "denied handle %p %d blocks: "
354  "insufficient log space\n", handle, nblocks);
355  goto unlock;
356  }
357 
358  handle->h_buffer_credits += nblocks;
359  transaction->t_outstanding_credits += nblocks;
360  result = 0;
361 
362  jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
363 unlock:
364  spin_unlock(&transaction->t_handle_lock);
365 error_out:
366  spin_unlock(&journal->j_state_lock);
367 out:
368  return result;
369 }
370 
371 
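372 /*
373  * journal_restart: give back the handle's credits, kick off a commit of
374  * its current transaction, and re-attach the handle to a new transaction
375  * with @nblocks credits via start_this_handle(). Returns 0 immediately if
376  * the handle has been aborted.
377  */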
387 int journal_restart(handle_t *handle, int nblocks)
388 {
389  transaction_t *transaction = handle->h_transaction;
390  journal_t *journal = transaction->t_journal;
391  int ret;
392 
393  /* If we've had an abort of any type, don't even think about
394  * actually doing the restart! */
395  if (is_handle_aborted(handle))
396  return 0;
397 
398  /*
399  * First unlink the handle from its current transaction, and start the
400  * commit on that.
401  */
402  J_ASSERT(transaction->t_updates > 0);
403  J_ASSERT(journal_current_handle() == handle);
404 
405  spin_lock(&journal->j_state_lock);
406  spin_lock(&transaction->t_handle_lock);
407  transaction->t_outstanding_credits -= handle->h_buffer_credits;
408  transaction->t_updates--;
409 
410  if (!transaction->t_updates)
411  wake_up(&journal->j_wait_updates);
412  spin_unlock(&transaction->t_handle_lock);
413 
414  jbd_debug(2, "restarting handle %p\n", handle);
415  __log_start_commit(journal, transaction->t_tid);
416  spin_unlock(&journal->j_state_lock);
417 
418  lock_map_release(&handle->h_lockdep_map);
419  handle->h_buffer_credits = nblocks;
420  ret = start_this_handle(journal, handle);
421  return ret;
422 }
423 
424 
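425 /*
426  * journal_lock_updates: establish a transaction barrier. Raises
427  * j_barrier_count so no new handles can start, then waits until the
428  * running transaction has no outstanding updates. Paired with
429  * journal_unlock_updates().
430  */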
439 void journal_lock_updates(journal_t *journal)
440 {
441  DEFINE_WAIT(wait);
442 
443 wait:
444  /* Wait for previous locked operation to finish */
445  wait_event(journal->j_wait_transaction_locked,
446  journal->j_barrier_count == 0);
447 
448  spin_lock(&journal->j_state_lock);
449  /*
450  * Check reliably under the lock whether we are the ones winning the race
451  * and locking the journal
452  */
453  if (journal->j_barrier_count > 0) {
454  spin_unlock(&journal->j_state_lock);
455  goto wait;
456  }
457  ++journal->j_barrier_count;
458 
459  /* Wait until there are no running updates */
460  while (1) {
461  transaction_t *transaction = journal->j_running_transaction;
462 
463  if (!transaction)
464  break;
465 
466  spin_lock(&transaction->t_handle_lock);
467  if (!transaction->t_updates) {
468  spin_unlock(&transaction->t_handle_lock);
469  break;
470  }
471  prepare_to_wait(&journal->j_wait_updates, &wait,
472  TASK_UNINTERRUPTIBLE);
473  spin_unlock(&transaction->t_handle_lock);
474  spin_unlock(&journal->j_state_lock);
475  schedule();
476  finish_wait(&journal->j_wait_updates, &wait);
477  spin_lock(&journal->j_state_lock);
478  }
479  spin_unlock(&journal->j_state_lock);
480 }
481 
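482 /*
483  * journal_unlock_updates: drop the transaction barrier taken by
484  * journal_lock_updates() and wake anyone waiting to start a new handle.
485  */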
488 void journal_unlock_updates (journal_t *journal)
489 {
490  J_ASSERT(journal->j_barrier_count != 0);
491 
492  spin_lock(&journal->j_state_lock);
493  --journal->j_barrier_count;
494  spin_unlock(&journal->j_state_lock);
495  wake_up(&journal->j_wait_transaction_locked);
496 }
497 
498 static void warn_dirty_buffer(struct buffer_head *bh)
499 {
500  char b[BDEVNAME_SIZE];
501 
502  printk(KERN_WARNING
503  "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
504  "There's a risk of filesystem corruption in case of system "
505  "crash.\n",
506  bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
507 }
508 
509 /*
510  * If the buffer is already part of the current transaction, then there
511  * is nothing we need to do. If it is already part of a prior
512  * transaction which we are still committing to disk, then we need to
513  * make sure that we do not overwrite the old copy: we do copy-out to
514  * preserve the copy going to disk. We also account the buffer against
515  * the handle's metadata buffer credits (unless the buffer is already
516  * part of the transaction, that is).
517  *
518  */
519 static int
520 do_get_write_access(handle_t *handle, struct journal_head *jh,
521  int force_copy)
522 {
523  struct buffer_head *bh;
524  transaction_t *transaction;
525  journal_t *journal;
526  int error;
527  char *frozen_buffer = NULL;
528  int need_copy = 0;
529 
530  if (is_handle_aborted(handle))
531  return -EROFS;
532 
533  transaction = handle->h_transaction;
534  journal = transaction->t_journal;
535 
536  jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
537 
538  JBUFFER_TRACE(jh, "entry");
539 repeat:
540  bh = jh2bh(jh);
541 
542  /* @@@ Need to check for errors here at some point. */
543 
544  lock_buffer(bh);
545  jbd_lock_bh_state(bh);
546 
547  /* We now hold the buffer lock so it is safe to query the buffer
548  * state. Is the buffer dirty?
549  *
550  * If so, there are two possibilities. The buffer may be
551  * non-journaled, and undergoing a quite legitimate writeback.
552  * Otherwise, it is journaled, and we don't expect dirty buffers
553  * in that state (the buffers should be marked JBD_Dirty
554  * instead.) So either the IO is being done under our own
555  * control and this is a bug, or it's a third party IO such as
556  * dump(8) (which may leave the buffer scheduled for read ---
557  * ie. locked but not dirty) or tune2fs (which may actually have
558  * the buffer dirtied, ugh.) */
559 
560  if (buffer_dirty(bh)) {
561  /*
562  * First question: is this buffer already part of the current
563  * transaction or the existing committing transaction?
564  */
565  if (jh->b_transaction) {
566  J_ASSERT_JH(jh,
567  jh->b_transaction == transaction ||
568  jh->b_transaction ==
569  journal->j_committing_transaction);
570  if (jh->b_next_transaction)
571  J_ASSERT_JH(jh, jh->b_next_transaction ==
572  transaction);
573  warn_dirty_buffer(bh);
574  }
575  /*
576  * In any case we need to clean the dirty flag and we must
577  * do it under the buffer lock to be sure we don't race
578  * with running write-out.
579  */
580  JBUFFER_TRACE(jh, "Journalling dirty buffer");
581  clear_buffer_dirty(bh);
582  set_buffer_jbddirty(bh);
583  }
584 
585  unlock_buffer(bh);
586 
587  error = -EROFS;
588  if (is_handle_aborted(handle)) {
589  jbd_unlock_bh_state(bh);
590  goto out;
591  }
592  error = 0;
593 
594  /*
595  * The buffer is already part of this transaction if b_transaction or
596  * b_next_transaction points to it
597  */
598  if (jh->b_transaction == transaction ||
599  jh->b_next_transaction == transaction)
600  goto done;
601 
602  /*
603  * this is the first time this transaction is touching this buffer,
604  * reset the modified flag
605  */
606  jh->b_modified = 0;
607 
608  /*
609  * If there is already a copy-out version of this buffer, then we don't
610  * need to make another one
611  */
612  if (jh->b_frozen_data) {
613  JBUFFER_TRACE(jh, "has frozen data");
614  J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
615  jh->b_next_transaction = transaction;
616  goto done;
617  }
618 
619  /* Is there data here we need to preserve? */
620 
621  if (jh->b_transaction && jh->b_transaction != transaction) {
622  JBUFFER_TRACE(jh, "owned by older transaction");
623  J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
624  J_ASSERT_JH(jh, jh->b_transaction ==
625  journal->j_committing_transaction);
626 
627  /* There is one case we have to be very careful about.
628  * If the committing transaction is currently writing
629  * this buffer out to disk and has NOT made a copy-out,
630  * then we cannot modify the buffer contents at all
631  * right now. The essence of copy-out is that it is the
632  * extra copy, not the primary copy, which gets
633  * journaled. If the primary copy is already going to
634  * disk then we cannot do copy-out here. */
635 
636  if (jh->b_jlist == BJ_Shadow) {
637  DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
638  wait_queue_head_t *wqh;
639 
640  wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
641 
642  JBUFFER_TRACE(jh, "on shadow: sleep");
643  jbd_unlock_bh_state(bh);
644  /* commit wakes up all shadow buffers after IO */
645  for ( ; ; ) {
646  prepare_to_wait(wqh, &wait.wait,
647  TASK_UNINTERRUPTIBLE);
648  if (jh->b_jlist != BJ_Shadow)
649  break;
650  schedule();
651  }
652  finish_wait(wqh, &wait.wait);
653  goto repeat;
654  }
655 
656  /* Only do the copy if the currently-owning transaction
657  * still needs it. If it is on the Forget list, the
658  * committing transaction is past that stage. The
659  * buffer had better remain locked during the kmalloc,
660  * but that should be true --- we hold the journal lock
661  * still and the buffer is already on the BUF_JOURNAL
662  * list so won't be flushed.
663  *
664  * Subtle point, though: if this is a get_undo_access,
665  * then we will be relying on the frozen_data to contain
666  * the new value of the committed_data record after the
667  * transaction, so we HAVE to force the frozen_data copy
668  * in that case. */
669 
670  if (jh->b_jlist != BJ_Forget || force_copy) {
671  JBUFFER_TRACE(jh, "generate frozen data");
672  if (!frozen_buffer) {
673  JBUFFER_TRACE(jh, "allocate memory for buffer");
674  jbd_unlock_bh_state(bh);
675  frozen_buffer =
676  jbd_alloc(jh2bh(jh)->b_size,
677  GFP_NOFS);
678  if (!frozen_buffer) {
679  printk(KERN_EMERG
680  "%s: OOM for frozen_buffer\n",
681  __func__);
682  JBUFFER_TRACE(jh, "oom!");
683  error = -ENOMEM;
684  jbd_lock_bh_state(bh);
685  goto done;
686  }
687  goto repeat;
688  }
689  jh->b_frozen_data = frozen_buffer;
690  frozen_buffer = NULL;
691  need_copy = 1;
692  }
693  jh->b_next_transaction = transaction;
694  }
695 
696 
697  /*
698  * Finally, if the buffer is not journaled right now, we need to make
699  * sure it doesn't get written to disk before the caller actually
700  * commits the new data
701  */
702  if (!jh->b_transaction) {
703  JBUFFER_TRACE(jh, "no transaction");
704  J_ASSERT_JH(jh, !jh->b_next_transaction);
705  JBUFFER_TRACE(jh, "file as BJ_Reserved");
706  spin_lock(&journal->j_list_lock);
707  __journal_file_buffer(jh, transaction, BJ_Reserved);
708  spin_unlock(&journal->j_list_lock);
709  }
710 
711 done:
712  if (need_copy) {
713  struct page *page;
714  int offset;
715  char *source;
716 
717  J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
718  "Possible IO failure.\n");
719  page = jh2bh(jh)->b_page;
720  offset = offset_in_page(jh2bh(jh)->b_data);
721  source = kmap_atomic(page);
722  memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
723  kunmap_atomic(source);
724  }
725  jbd_unlock_bh_state(bh);
726 
727  /*
728  * If we are about to journal a buffer, then any revoke pending on it is
729  * no longer valid
730  */
731  journal_cancel_revoke(handle, jh);
732 
733 out:
734  if (unlikely(frozen_buffer)) /* It's usually NULL */
735  jbd_free(frozen_buffer, bh->b_size);
736 
737  JBUFFER_TRACE(jh, "exit");
738  return error;
739 }
740 
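741 /*
742  * journal_get_write_access: notify intent to modify the metadata buffer
743  * @bh as part of @handle's transaction. Attaches a journal_head and calls
744  * do_get_write_access() so any copy-out against the committing transaction
745  * happens before the caller modifies the buffer. Returns 0 or a negative
746  * errno.
747  */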
752 int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
753 {
754  struct journal_head *jh = journal_add_journal_head(bh);
755  int rc;
756 
757  /* We do not want to get caught playing with fields which the
758  * log thread also manipulates. Make sure that the buffer
759  * completes any outstanding IO before proceeding. */
760  rc = do_get_write_access(handle, jh, 0);
761  journal_put_journal_head(jh);
762  return rc;
763 }
764 
765 
766 /*
767  * When the user wants to journal a newly created buffer_head
768  * (ie. getblk() returned a new buffer and we are going to populate it
769  * manually rather than reading off disk), then we need to keep the
770  * buffer_head locked until it has been completely filled with new
771  * data. In this case, we should be able to make the assertion that
772  * the bh is not already part of an existing transaction.
773  *
774  * The buffer should already be locked by the caller by this point.
775  * There is no lock ranking violation: it was a newly created,
776  * unlocked buffer beforehand. */
777 
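778 /*
779  * journal_get_create_access: notify intent to use a newly allocated,
780  * caller-locked buffer. The buffer is filed as BJ_Reserved on the running
781  * transaction (or marked for it via b_next_transaction if the committing
782  * transaction still owns it) and any pending revoke on the block is
783  * cancelled.
784  */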
785 int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
786 {
787  transaction_t *transaction = handle->h_transaction;
788  journal_t *journal = transaction->t_journal;
789  struct journal_head *jh = journal_add_journal_head(bh);
790  int err;
791 
792  jbd_debug(5, "journal_head %p\n", jh);
793  err = -EROFS;
794  if (is_handle_aborted(handle))
795  goto out;
796  err = 0;
797 
798  JBUFFER_TRACE(jh, "entry");
799  /*
800  * The buffer may already belong to this transaction due to pre-zeroing
801  * in the filesystem's new_block code. It may also be on the previous,
802  * committing transaction's lists, but it HAS to be in Forget state in
803  * that case: the transaction must have deleted the buffer for it to be
804  * reused here.
805  */
806  jbd_lock_bh_state(bh);
807  spin_lock(&journal->j_list_lock);
808  J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
809  jh->b_transaction == NULL ||
810  (jh->b_transaction == journal->j_committing_transaction &&
811  jh->b_jlist == BJ_Forget)));
812 
813  J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
814  J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
815 
816  if (jh->b_transaction == NULL) {
817  /*
818  * Previous journal_forget() could have left the buffer
819  * with jbddirty bit set because it was being committed. When
820  * the commit finished, we've filed the buffer for
821  * checkpointing and marked it dirty. Now we are reallocating
822  * the buffer so the transaction freeing it must have
823  * committed and so it's safe to clear the dirty bit.
824  */
825  clear_buffer_dirty(jh2bh(jh));
826 
827  /* first access by this transaction */
828  jh->b_modified = 0;
829 
830  JBUFFER_TRACE(jh, "file as BJ_Reserved");
831  __journal_file_buffer(jh, transaction, BJ_Reserved);
832  } else if (jh->b_transaction == journal->j_committing_transaction) {
833  /* first access by this transaction */
834  jh->b_modified = 0;
835 
836  JBUFFER_TRACE(jh, "set next transaction");
837  jh->b_next_transaction = transaction;
838  }
839  spin_unlock(&journal->j_list_lock);
840  jbd_unlock_bh_state(bh);
841 
842  /*
843  * akpm: I added this. ext3_alloc_branch can pick up new indirect
844  * blocks which contain freed but then revoked metadata. We need
845  * to cancel the revoke in case we end up freeing it yet again
846  * and then reallocating it as data - this would cause a second revoke,
847  * which hits an assertion error.
848  */
849  JBUFFER_TRACE(jh, "cancelling revoke");
850  journal_cancel_revoke(handle, jh);
851 out:
852  journal_put_journal_head(jh);
853  return err;
854 }
855 
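856 /*
857  * journal_get_undo_access: notify intent to modify a buffer whose
858  * pre-commit contents must stay visible, e.g. a block bitmap. Forces a
859  * frozen-data copy via do_get_write_access(..., force_copy = 1) and saves
860  * the current contents in jh->b_committed_data.
861  */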
881 int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
882 {
883  int err;
884  struct journal_head *jh = journal_add_journal_head(bh);
885  char *committed_data = NULL;
886 
887  JBUFFER_TRACE(jh, "entry");
888 
889  /*
890  * Do this first --- it can drop the journal lock, so we want to
891  * make sure that obtaining the committed_data is done
892  * atomically wrt. completion of any outstanding commits.
893  */
894  err = do_get_write_access(handle, jh, 1);
895  if (err)
896  goto out;
897 
898 repeat:
899  if (!jh->b_committed_data) {
900  committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
901  if (!committed_data) {
902  printk(KERN_EMERG "%s: No memory for committed data\n",
903  __func__);
904  err = -ENOMEM;
905  goto out;
906  }
907  }
908 
909  jbd_lock_bh_state(bh);
910  if (!jh->b_committed_data) {
911  /* Copy out the current buffer contents into the
912  * preserved, committed copy. */
913  JBUFFER_TRACE(jh, "generate b_committed data");
914  if (!committed_data) {
915  jbd_unlock_bh_state(bh);
916  goto repeat;
917  }
918 
919  jh->b_committed_data = committed_data;
920  committed_data = NULL;
921  memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
922  }
923  jbd_unlock_bh_state(bh);
924 out:
925  journal_put_journal_head(jh);
926  if (unlikely(committed_data))
927  jbd_free(committed_data, bh->b_size);
928  return err;
929 }
930 
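931 /*
932  * journal_dirty_data: mark data buffer @bh for writeout before the
933  * current transaction commits (ordered-data mode). Files the buffer on
934  * the running transaction's BJ_SyncData list, taking care not to steal
935  * buffers still owned by the committing transaction. Returns 0, or -EIO
936  * if the buffer has seen an I/O error.
937  */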
948 int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
949 {
950  journal_t *journal = handle->h_transaction->t_journal;
951  int need_brelse = 0;
952  struct journal_head *jh;
953  int ret = 0;
954 
955  if (is_handle_aborted(handle))
956  return ret;
957 
958  jh = journal_add_journal_head(bh);
959  JBUFFER_TRACE(jh, "entry");
960 
961  /*
962  * The buffer could *already* be dirty. Writeout can start
963  * at any time.
964  */
965  jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
966 
967  /*
968  * What if the buffer is already part of a running transaction?
969  *
970  * There are two cases:
971  * 1) It is part of the current running transaction. Refile it,
972  * just in case we have allocated it as metadata, deallocated
973  * it, then reallocated it as data.
974  * 2) It is part of the previous, still-committing transaction.
975  * If all we want to do is to guarantee that the buffer will be
976  * written to disk before this new transaction commits, then
977  * being sure that the *previous* transaction has this same
978  * property is sufficient for us! Just leave it on its old
979  * transaction.
980  *
981  * In case (2), the buffer must not already exist as metadata
982  * --- that would violate write ordering (a transaction is free
983  * to write its data at any point, even before the previous
984  * committing transaction has committed). The caller must
985  * never, ever allow this to happen: there's nothing we can do
986  * about it in this layer.
987  */
988  jbd_lock_bh_state(bh);
989  spin_lock(&journal->j_list_lock);
990 
991  /* Now that we have bh_state locked, are we really still mapped? */
992  if (!buffer_mapped(bh)) {
993  JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
994  goto no_journal;
995  }
996 
997  if (jh->b_transaction) {
998  JBUFFER_TRACE(jh, "has transaction");
999  if (jh->b_transaction != handle->h_transaction) {
1000  JBUFFER_TRACE(jh, "belongs to older transaction");
1001  J_ASSERT_JH(jh, jh->b_transaction ==
1002  journal->j_committing_transaction);
1003 
1004  /* @@@ IS THIS TRUE ? */
1005  /*
1006  * Not any more. Scenario: someone does a write()
1007  * in data=journal mode. The buffer's transaction has
1008  * moved into commit. Then someone does another
1009  * write() to the file. We do the frozen data copyout
1010  * and set b_next_transaction to point to j_running_t.
1011  * And while we're in that state, someone does a
1012  * writepage() in an attempt to pageout the same area
1013  * of the file via a shared mapping. At present that
1014  * calls journal_dirty_data(), and we get right here.
1015  * It may be too late to journal the data. Simply
1016  * falling through to the next test will suffice: the
1017  * data will be dirty and will be checkpointed. The
1018  * ordering comments in the next comment block still
1019  * apply.
1020  */
1021  //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1022 
1023  /*
1024  * If we're journalling data, and this buffer was
1025  * subject to a write(), it could be metadata, forget
1026  * or shadow against the committing transaction. Now,
1027  * someone has dirtied the same darn page via a mapping
1028  * and it is being writepage()'d.
1029  * We *could* just steal the page from commit, with some
1030  * fancy locking there. Instead, we just skip it -
1031  * don't tie the page's buffers to the new transaction
1032  * at all.
1033  * Implication: if we crash before the writepage() data
1034  * is written into the filesystem, recovery will replay
1035  * the write() data.
1036  */
1037  if (jh->b_jlist != BJ_None &&
1038  jh->b_jlist != BJ_SyncData &&
1039  jh->b_jlist != BJ_Locked) {
1040  JBUFFER_TRACE(jh, "Not stealing");
1041  goto no_journal;
1042  }
1043 
1044  /*
1045  * This buffer may be undergoing writeout in commit. We
1046  * can't return from here and let the caller dirty it
1047  * again because that can cause the write-out loop in
1048  * commit to never terminate.
1049  */
1050  if (buffer_dirty(bh)) {
1051  get_bh(bh);
1052  spin_unlock(&journal->j_list_lock);
1053  jbd_unlock_bh_state(bh);
1054  need_brelse = 1;
1055  sync_dirty_buffer(bh);
1056  jbd_lock_bh_state(bh);
1057  spin_lock(&journal->j_list_lock);
1058  /* Since we dropped the lock... */
1059  if (!buffer_mapped(bh)) {
1060  JBUFFER_TRACE(jh, "buffer got unmapped");
1061  goto no_journal;
1062  }
1063  /* The buffer may become locked again at any
1064  time if it is redirtied */
1065  }
1066 
1067  /*
1068  * We cannot remove the buffer with io error from the
1069  * committing transaction, because otherwise it would
1070  * miss the error and the commit would not abort.
1071  */
1072  if (unlikely(!buffer_uptodate(bh))) {
1073  ret = -EIO;
1074  goto no_journal;
1075  }
1076  /* We might have slept so buffer could be refiled now */
1077  if (jh->b_transaction != NULL &&
1078  jh->b_transaction != handle->h_transaction) {
1079  JBUFFER_TRACE(jh, "unfile from commit");
1080  __journal_temp_unlink_buffer(jh);
1081  /* It still points to the committing
1082  * transaction; move it to this one so
1083  * that the refile assert checks are
1084  * happy. */
1085  jh->b_transaction = handle->h_transaction;
1086  }
1087  /* The buffer will be refiled below */
1088 
1089  }
1090  /*
1091  * Special case --- the buffer might actually have been
1092  * allocated and then immediately deallocated in the previous,
1093  * committing transaction, so might still be left on that
1094  * transaction's metadata lists.
1095  */
1096  if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1097  JBUFFER_TRACE(jh, "not on correct data list: unfile");
1098  J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1099  JBUFFER_TRACE(jh, "file as data");
1100  __journal_file_buffer(jh, handle->h_transaction,
1101  BJ_SyncData);
1102  }
1103  } else {
1104  JBUFFER_TRACE(jh, "not on a transaction");
1105  __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1106  }
1107 no_journal:
1108  spin_unlock(&journal->j_list_lock);
1109  jbd_unlock_bh_state(bh);
1110  if (need_brelse) {
1111  BUFFER_TRACE(bh, "brelse");
1112  __brelse(bh);
1113  }
1114  JBUFFER_TRACE(jh, "exit");
1115  journal_put_journal_head(jh);
1116  return ret;
1117 }
1118 
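1119 /*
1120  * journal_dirty_metadata: mark @bh as containing metadata that must be
1121  * journalled as part of @handle's transaction. The first modification by
1122  * a transaction consumes one buffer credit from the handle; the buffer is
1123  * then filed on the running transaction's BJ_Metadata list unless it still
1124  * belongs to the committing transaction, in which case it will be refiled
1125  * when that commit completes.
1126  */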
1138 int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1139 {
1140  transaction_t *transaction = handle->h_transaction;
1141  journal_t *journal = transaction->t_journal;
1142  struct journal_head *jh = bh2jh(bh);
1143 
1144  jbd_debug(5, "journal_head %p\n", jh);
1145  JBUFFER_TRACE(jh, "entry");
1146  if (is_handle_aborted(handle))
1147  goto out;
1148 
1149  jbd_lock_bh_state(bh);
1150 
1151  if (jh->b_modified == 0) {
1152  /*
1153  * This buffer has been modified and is becoming part
1154  * of the transaction. This needs to be done
1155  * once per transaction -bzzz
1156  */
1157  jh->b_modified = 1;
1158  J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1159  handle->h_buffer_credits--;
1160  }
1161 
1162  /*
1163  * fastpath, to avoid expensive locking. If this buffer is already
1164  * on the running transaction's metadata list there is nothing to do.
1165  * Nobody can take it off again because there is a handle open.
1166  * I _think_ we're OK here with SMP barriers - a mistaken decision will
1167  * result in this test being false, so we go in and take the locks.
1168  */
1169  if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1170  JBUFFER_TRACE(jh, "fastpath");
1171  J_ASSERT_JH(jh, jh->b_transaction ==
1172  journal->j_running_transaction);
1173  goto out_unlock_bh;
1174  }
1175 
1176  set_buffer_jbddirty(bh);
1177 
1178  /*
1179  * Metadata already on the current transaction list doesn't
1180  * need to be filed. Metadata on another transaction's list must
1181  * be committing, and will be refiled once the commit completes:
1182  * leave it alone for now.
1183  */
1184  if (jh->b_transaction != transaction) {
1185  JBUFFER_TRACE(jh, "already on other transaction");
1186  J_ASSERT_JH(jh, jh->b_transaction ==
1187  journal->j_committing_transaction);
1188  J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1189  /* And this case is illegal: we can't reuse another
1190  * transaction's data buffer, ever. */
1191  goto out_unlock_bh;
1192  }
1193 
1194  /* That test should have eliminated the following case: */
1195  J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1196 
1197  JBUFFER_TRACE(jh, "file as BJ_Metadata");
1198  spin_lock(&journal->j_list_lock);
1199  __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1200  spin_unlock(&journal->j_list_lock);
1201 out_unlock_bh:
1202  jbd_unlock_bh_state(bh);
1203 out:
1204  JBUFFER_TRACE(jh, "exit");
1205  return 0;
1206 }
1207 
1208 /*
1209  * journal_release_buffer: undo a get_write_access without any buffer
1210  * updates, if the update decided in the end that it didn't need access.
1211  *
1212  */
1213 void
1214 journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1215 {
1216  BUFFER_TRACE(bh, "entry");
1217 }
1218 
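1219 /*
1220  * journal_forget: tell the journal that @bh's contents no longer matter
1221  * (the block is being freed). A buffer owned by the current transaction is
1222  * dropped immediately; one with a checkpoint record is refiled as
1223  * BJ_Forget so the checkpoint can be cleared after commit. If this
1224  * transaction had modified the buffer, its credit is returned to the
1225  * handle.
1226  */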
1236 int journal_forget (handle_t *handle, struct buffer_head *bh)
1237 {
1238  transaction_t *transaction = handle->h_transaction;
1239  journal_t *journal = transaction->t_journal;
1240  struct journal_head *jh;
1241  int drop_reserve = 0;
1242  int err = 0;
1243  int was_modified = 0;
1244 
1245  BUFFER_TRACE(bh, "entry");
1246 
1247  jbd_lock_bh_state(bh);
1248  spin_lock(&journal->j_list_lock);
1249 
1250  if (!buffer_jbd(bh))
1251  goto not_jbd;
1252  jh = bh2jh(bh);
1253 
1254  /* Critical error: attempting to delete a bitmap buffer, maybe?
1255  * Don't do any jbd operations, and return an error. */
1256  if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1257  "inconsistent data on disk")) {
1258  err = -EIO;
1259  goto not_jbd;
1260  }
1261 
1262  /* keep track of whether or not this transaction modified us */
1263  was_modified = jh->b_modified;
1264 
1265  /*
1266  * The buffer's going from the transaction, we must drop
1267  * all references -bzzz
1268  */
1269  jh->b_modified = 0;
1270 
1271  if (jh->b_transaction == handle->h_transaction) {
1272  J_ASSERT_JH(jh, !jh->b_frozen_data);
1273 
1274  /* If we are forgetting a buffer which is already part
1275  * of this transaction, then we can just drop it from
1276  * the transaction immediately. */
1277  clear_buffer_dirty(bh);
1278  clear_buffer_jbddirty(bh);
1279 
1280  JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1281 
1282  /*
1283  * we only want to drop a reference if this transaction
1284  * modified the buffer
1285  */
1286  if (was_modified)
1287  drop_reserve = 1;
1288 
1289  /*
1290  * We are no longer going to journal this buffer.
1291  * However, the commit of this transaction is still
1292  * important to the buffer: the delete that we are now
1293  * processing might obsolete an old log entry, so by
1294  * committing, we can satisfy the buffer's checkpoint.
1295  *
1296  * So, if we have a checkpoint on the buffer, we should
1297  * now refile the buffer on our BJ_Forget list so that
1298  * we know to remove the checkpoint after we commit.
1299  */
1300 
1301  if (jh->b_cp_transaction) {
1302  __journal_temp_unlink_buffer(jh);
1303  __journal_file_buffer(jh, transaction, BJ_Forget);
1304  } else {
1305  __journal_unfile_buffer(jh);
1306  if (!buffer_jbd(bh)) {
1307  spin_unlock(&journal->j_list_lock);
1308  jbd_unlock_bh_state(bh);
1309  __bforget(bh);
1310  goto drop;
1311  }
1312  }
1313  } else if (jh->b_transaction) {
1314  J_ASSERT_JH(jh, (jh->b_transaction ==
1315  journal->j_committing_transaction));
1316  /* However, if the buffer is still owned by a prior
1317  * (committing) transaction, we can't drop it yet... */
1318  JBUFFER_TRACE(jh, "belongs to older transaction");
1319  /* ... but we CAN drop it from the new transaction if we
1320  * have also modified it since the original commit. */
1321 
1322  if (jh->b_next_transaction) {
1323  J_ASSERT(jh->b_next_transaction == transaction);
1324  jh->b_next_transaction = NULL;
1325 
1326  /*
1327  * only drop a reference if this transaction modified
1328  * the buffer
1329  */
1330  if (was_modified)
1331  drop_reserve = 1;
1332  }
1333  }
1334 
1335 not_jbd:
1336  spin_unlock(&journal->j_list_lock);
1337  jbd_unlock_bh_state(bh);
1338  __brelse(bh);
1339 drop:
1340  if (drop_reserve) {
1341  /* no need to reserve log space for this block -bzzz */
1342  handle->h_buffer_credits++;
1343  }
1344  return err;
1345 }
1346 
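1347 /*
1348  * journal_stop: complete @handle. Drops a reference; when the last
1349  * reference goes, the unused credits are returned to the transaction and,
1350  * for synchronous handles or transactions that are too old or too full, a
1351  * commit is started (and waited for if the handle was synchronous).
1352  * Returns 0, or -EIO if the handle was aborted.
1353  */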
1363 int journal_stop(handle_t *handle)
1364 {
1365  transaction_t *transaction = handle->h_transaction;
1366  journal_t *journal = transaction->t_journal;
1367  int err;
1368  pid_t pid;
1369 
1370  J_ASSERT(journal_current_handle() == handle);
1371 
1372  if (is_handle_aborted(handle))
1373  err = -EIO;
1374  else {
1375  J_ASSERT(transaction->t_updates > 0);
1376  err = 0;
1377  }
1378 
1379  if (--handle->h_ref > 0) {
1380  jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1381  handle->h_ref);
1382  return err;
1383  }
1384 
1385  jbd_debug(4, "Handle %p going down\n", handle);
1386 
1387  /*
1388  * Implement synchronous transaction batching. If the handle
1389  * was synchronous, don't force a commit immediately. Let's
1390  * yield and let another thread piggyback onto this transaction.
1391  * Keep doing that while new threads continue to arrive.
1392  * It doesn't cost much - we're about to run a commit and sleep
1393  * on IO anyway. Speeds up many-threaded, many-dir operations
1394  * by 30x or more...
1395  *
1396  * We try and optimize the sleep time against what the underlying disk
1397  * can do, instead of having a static sleep time. This is useful for
1398  * the case where our storage is so fast that it is more optimal to go
1399  * ahead and force a flush and wait for the transaction to be committed
1400  * than it is to wait for an arbitrary amount of time for new writers to
1401  * join the transaction. We achieve this by measuring how long it takes
1402  * to commit a transaction, and compare it with how long this
1403  * transaction has been running, and if run time < commit time then we
1404  * sleep for the delta and commit. This greatly helps super fast disks
1405  * that would see slowdowns as more threads started doing fsyncs.
1406  *
1407  * But don't do this if this process was the most recent one to
1408  * perform a synchronous write. We do this to detect the case where a
1409  * single process is doing a stream of sync writes. No point in waiting
1410  * for joiners in that case.
1411  */
1412  pid = current->pid;
1413  if (handle->h_sync && journal->j_last_sync_writer != pid) {
1414  u64 commit_time, trans_time;
1415 
1416  journal->j_last_sync_writer = pid;
1417 
1418  spin_lock(&journal->j_state_lock);
1419  commit_time = journal->j_average_commit_time;
1420  spin_unlock(&journal->j_state_lock);
1421 
1422  trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1423  transaction->t_start_time));
1424 
1425  commit_time = min_t(u64, commit_time,
1426  1000*jiffies_to_usecs(1));
1427 
1428  if (trans_time < commit_time) {
1429  ktime_t expires = ktime_add_ns(ktime_get(),
1430  commit_time);
1431  set_current_state(TASK_UNINTERRUPTIBLE);
1432  schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1433  }
1434  }
1435 
1436  current->journal_info = NULL;
1437  spin_lock(&journal->j_state_lock);
1438  spin_lock(&transaction->t_handle_lock);
1439  transaction->t_outstanding_credits -= handle->h_buffer_credits;
1440  transaction->t_updates--;
1441  if (!transaction->t_updates) {
1442  wake_up(&journal->j_wait_updates);
1443  if (journal->j_barrier_count)
1444  wake_up(&journal->j_wait_transaction_locked);
1445  }
1446 
1447  /*
1448  * If the handle is marked SYNC, we need to set another commit
1449  * going! We also want to force a commit if the current
1450  * transaction is occupying too much of the log, or if the
1451  * transaction is too old now.
1452  */
1453  if (handle->h_sync ||
1454  transaction->t_outstanding_credits >
1455  journal->j_max_transaction_buffers ||
1456  time_after_eq(jiffies, transaction->t_expires)) {
1457  /* Do this even for aborted journals: an abort still
1458  * completes the commit thread, it just doesn't write
1459  * anything to disk. */
1460  tid_t tid = transaction->t_tid;
1461 
1462  spin_unlock(&transaction->t_handle_lock);
1463  jbd_debug(2, "transaction too old, requesting commit for "
1464  "handle %p\n", handle);
1465  /* This is non-blocking */
1466  __log_start_commit(journal, transaction->t_tid);
1467  spin_unlock(&journal->j_state_lock);
1468 
1469  /*
1470  * Special case: JFS_SYNC synchronous updates require us
1471  * to wait for the commit to complete.
1472  */
1473  if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1474  err = log_wait_commit(journal, tid);
1475  } else {
1476  spin_unlock(&transaction->t_handle_lock);
1477  spin_unlock(&journal->j_state_lock);
1478  }
1479 
1480  lock_map_release(&handle->h_lockdep_map);
1481 
1482  jbd_free_handle(handle);
1483  return err;
1484 }
1485 
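1486 /*
1487  * journal_force_commit: force an immediate commit and wait for it, by
1488  * starting a one-credit handle, marking it h_sync and stopping it.
1489  * Returns 0 or the error from journal_start()/journal_stop().
1490  */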
1494 int journal_force_commit(journal_t *journal)
1495 {
1496  handle_t *handle;
1497  int ret;
1498 
1499  handle = journal_start(journal, 1);
1500  if (IS_ERR(handle)) {
1501  ret = PTR_ERR(handle);
1502  } else {
1503  handle->h_sync = 1;
1504  ret = journal_stop(handle);
1505  }
1506  return ret;
1507 }
1508 
1509 /*
1510  *
1511  * List management code snippets: various functions for manipulating the
1512  * transaction buffer lists.
1513  *
1514  */
1515 
1516 /*
1517  * Append a buffer to a transaction list, given the transaction's list head
1518  * pointer.
1519  *
1520  * j_list_lock is held.
1521  *
1522  * jbd_lock_bh_state(jh2bh(jh)) is held.
1523  */
1524 
1525 static inline void
1526 __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1527 {
1528  if (!*list) {
1529  jh->b_tnext = jh->b_tprev = jh;
1530  *list = jh;
1531  } else {
1532  /* Insert at the tail of the list to preserve order */
1533  struct journal_head *first = *list, *last = first->b_tprev;
1534  jh->b_tprev = last;
1535  jh->b_tnext = first;
1536  last->b_tnext = first->b_tprev = jh;
1537  }
1538 }
1539 
1540 /*
1541  * Remove a buffer from a transaction list, given the transaction's list
1542  * head pointer.
1543  *
1544  * Called with j_list_lock held, and the journal may not be locked.
1545  *
1546  * jbd_lock_bh_state(jh2bh(jh)) is held.
1547  */
1548 
1549 static inline void
1550 __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1551 {
1552  if (*list == jh) {
1553  *list = jh->b_tnext;
1554  if (*list == jh)
1555  *list = NULL;
1556  }
1557  jh->b_tprev->b_tnext = jh->b_tnext;
1558  jh->b_tnext->b_tprev = jh->b_tprev;
1559 }
1560 
1561 /*
1562  * Remove a buffer from the appropriate transaction list.
1563  *
1564  * Note that this function can *change* the value of
1565  * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1566  * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1567  * is holding onto a copy of one of these pointers, it could go bad.
1568  * Generally the caller needs to re-read the pointer from the transaction_t.
1569  *
1570  * Called under j_list_lock. The journal may not be locked.
1571  */
1572 static void __journal_temp_unlink_buffer(struct journal_head *jh)
1573 {
1574  struct journal_head **list = NULL;
1575  transaction_t *transaction;
1576  struct buffer_head *bh = jh2bh(jh);
1577 
1578  J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1579  transaction = jh->b_transaction;
1580  if (transaction)
1581  assert_spin_locked(&transaction->t_journal->j_list_lock);
1582 
1583  J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1584  if (jh->b_jlist != BJ_None)
1585  J_ASSERT_JH(jh, transaction != NULL);
1586 
1587  switch (jh->b_jlist) {
1588  case BJ_None:
1589  return;
1590  case BJ_SyncData:
1591  list = &transaction->t_sync_datalist;
1592  break;
1593  case BJ_Metadata:
1594  transaction->t_nr_buffers--;
1595  J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1596  list = &transaction->t_buffers;
1597  break;
1598  case BJ_Forget:
1599  list = &transaction->t_forget;
1600  break;
1601  case BJ_IO:
1602  list = &transaction->t_iobuf_list;
1603  break;
1604  case BJ_Shadow:
1605  list = &transaction->t_shadow_list;
1606  break;
1607  case BJ_LogCtl:
1608  list = &transaction->t_log_list;
1609  break;
1610  case BJ_Reserved:
1611  list = &transaction->t_reserved_list;
1612  break;
1613  case BJ_Locked:
1614  list = &transaction->t_locked_list;
1615  break;
1616  }
1617 
1618  __blist_del_buffer(list, jh);
1619  jh->b_jlist = BJ_None;
1620  if (test_clear_buffer_jbddirty(bh))
1621  mark_buffer_dirty(bh); /* Expose it to the VM */
1622 }
1623 
1624 /*
1625  * Remove buffer from all transactions.
1626  *
1627  * Called with bh_state lock and j_list_lock
1628  *
1629  * jh and bh may be already freed when this function returns.
1630  */
1631 void __journal_unfile_buffer(struct journal_head *jh)
1632 {
1633  __journal_temp_unlink_buffer(jh);
1634  jh->b_transaction = NULL;
1635  journal_put_journal_head(jh);
1636 }
1637 
1638 void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1639 {
1640  struct buffer_head *bh = jh2bh(jh);
1641 
1642  /* Get reference so that buffer cannot be freed before we unlock it */
1643  get_bh(bh);
1644  jbd_lock_bh_state(bh);
1645  spin_lock(&journal->j_list_lock);
1646  __journal_unfile_buffer(jh);
1647  spin_unlock(&journal->j_list_lock);
1648  jbd_unlock_bh_state(bh);
1649  __brelse(bh);
1650 }
1651 
1652 /*
1653  * Called from journal_try_to_free_buffers().
1654  *
1655  * Called under jbd_lock_bh_state(bh)
1656  */
1657 static void
1658 __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1659 {
1660  struct journal_head *jh;
1661 
1662  jh = bh2jh(bh);
1663 
1664  if (buffer_locked(bh) || buffer_dirty(bh))
1665  goto out;
1666 
1667  if (jh->b_next_transaction != NULL)
1668  goto out;
1669 
1670  spin_lock(&journal->j_list_lock);
1671  if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1672  if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1673  /* A written-back ordered data buffer */
1674  JBUFFER_TRACE(jh, "release data");
1675  __journal_unfile_buffer(jh);
1676  }
1677  } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1678  /* written-back checkpointed metadata buffer */
1679  if (jh->b_jlist == BJ_None) {
1680  JBUFFER_TRACE(jh, "remove from checkpoint list");
1681  __journal_remove_checkpoint(jh);
1682  }
1683  }
1684  spin_unlock(&journal->j_list_lock);
1685 out:
1686  return;
1687 }
1688 
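1689 /*
1690  * journal_try_to_free_buffers: called when the VM wants to release
1691  * @page's buffers. For each buffer, drop the journal's interest if it is
1692  * a written-back data buffer or a checkpointed buffer no longer on any
1693  * list, then attempt try_to_free_buffers(). Returns non-zero if the
1694  * page's buffers were freed.
1695  */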
1727 int journal_try_to_free_buffers(journal_t *journal,
1728  struct page *page, gfp_t gfp_mask)
1729 {
1730  struct buffer_head *head;
1731  struct buffer_head *bh;
1732  int ret = 0;
1733 
1734  J_ASSERT(PageLocked(page));
1735 
1736  head = page_buffers(page);
1737  bh = head;
1738  do {
1739  struct journal_head *jh;
1740 
1741  /*
1742  * We take our own ref against the journal_head here to avoid
1743  * having to add tons of locking around each instance of
1744  * journal_put_journal_head().
1745  */
1746  jh = journal_grab_journal_head(bh);
1747  if (!jh)
1748  continue;
1749 
1750  jbd_lock_bh_state(bh);
1751  __journal_try_to_free_buffer(journal, bh);
1752  journal_put_journal_head(jh);
1753  jbd_unlock_bh_state(bh);
1754  if (buffer_jbd(bh))
1755  goto busy;
1756  } while ((bh = bh->b_this_page) != head);
1757 
1758  ret = try_to_free_buffers(page);
1759 
1760 busy:
1761  return ret;
1762 }
1763 
1764 /*
1765  * This buffer is no longer needed. If it is on an older transaction's
1766  * checkpoint list we need to record it on this transaction's forget list
1767  * to pin this buffer (and hence its checkpointing transaction) down until
1768  * this transaction commits. If the buffer isn't on a checkpoint list, we
1769  * release it.
1770  * Returns non-zero if JBD no longer has an interest in the buffer.
1771  *
1772  * Called under j_list_lock.
1773  *
1774  * Called under jbd_lock_bh_state(bh).
1775  */
1776 static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1777 {
1778  int may_free = 1;
1779  struct buffer_head *bh = jh2bh(jh);
1780 
1781  if (jh->b_cp_transaction) {
1782  JBUFFER_TRACE(jh, "on running+cp transaction");
1783  __journal_temp_unlink_buffer(jh);
1784  /*
1785  * We don't want to write the buffer anymore, clear the
1786  * bit so that we don't confuse checks in
1787  * __journal_file_buffer
1788  */
1789  clear_buffer_dirty(bh);
1790  __journal_file_buffer(jh, transaction, BJ_Forget);
1791  may_free = 0;
1792  } else {
1793  JBUFFER_TRACE(jh, "on running transaction");
1794  __journal_unfile_buffer(jh);
1795  }
1796  return may_free;
1797 }
1798 
1799 /*
1800  * journal_invalidatepage
1801  *
1802  * This code is tricky. It has a number of cases to deal with.
1803  *
1804  * There are two invariants which this code relies on:
1805  *
1806  * i_size must be updated on disk before we start calling invalidatepage on the
1807  * data.
1808  *
1809  * This is done in ext3 by defining an ext3_setattr method which
1810  * updates i_size before truncate gets going. By maintaining this
1811  * invariant, we can be sure that it is safe to throw away any buffers
1812  * attached to the current transaction: once the transaction commits,
1813  * we know that the data will not be needed.
1814  *
1815  * Note however that we can *not* throw away data belonging to the
1816  * previous, committing transaction!
1817  *
1818  * Any disk blocks which *are* part of the previous, committing
1819  * transaction (and which therefore cannot be discarded immediately) are
1820  * not going to be reused in the new running transaction
1821  *
1822  * The bitmap committed_data images guarantee this: any block which is
1823  * allocated in one transaction and removed in the next will be marked
1824  * as in-use in the committed_data bitmap, so cannot be reused until
1825  * the next transaction to delete the block commits. This means that
1826  * leaving committing buffers dirty is quite safe: the disk blocks
1827  * cannot be reallocated to a different file and so buffer aliasing is
1828  * not possible.
1829  *
1830  *
1831  * The above applies mainly to ordered data mode. In writeback mode we
1832  * don't make guarantees about the order in which data hits disk --- in
1833  * particular we don't guarantee that new dirty data is flushed before
1834  * transaction commit --- so it is always safe just to discard data
1835  * immediately in that mode. --sct
1836  */
1837 
1838 /*
1839  * The journal_unmap_buffer helper function returns zero if the buffer
1840  * concerned remains pinned as an anonymous buffer belonging to an older
1841  * transaction.
1842  *
1843  * We're outside-transaction here. Either or both of j_running_transaction
1844  * and j_committing_transaction may be NULL.
1845  */
1846 static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1847  int partial_page)
1848 {
1849  transaction_t *transaction;
1850  struct journal_head *jh;
1851  int may_free = 1;
1852 
1853  BUFFER_TRACE(bh, "entry");
1854 
1855 retry:
1856  /*
1857  * It is safe to proceed here without the j_list_lock because the
1858  * buffers cannot be stolen by try_to_free_buffers as long as we are
1859  * holding the page lock. --sct
1860  */
1861 
1862  if (!buffer_jbd(bh))
1863  goto zap_buffer_unlocked;
1864 
1865  spin_lock(&journal->j_state_lock);
1866  jbd_lock_bh_state(bh);
1867  spin_lock(&journal->j_list_lock);
1868 
1869  jh = journal_grab_journal_head(bh);
1870  if (!jh)
1871  goto zap_buffer_no_jh;
1872 
1873  /*
1874  * We cannot remove the buffer from checkpoint lists until the
1875  * transaction adding inode to orphan list (let's call it T)
1876  * is committed. Otherwise if the transaction changing the
1877  * buffer would be cleaned from the journal before T is
1878  * committed, a crash will cause that the correct contents of
1879  * the buffer will be lost. On the other hand we have to
1880  * clear the buffer dirty bit at latest at the moment when the
1881  * transaction marking the buffer as freed in the filesystem
1882  * structures is committed because from that moment on the
1883  * block can be reallocated and used by a different page.
1884  * Since the block hasn't been freed yet but the inode has
1885  * already been added to orphan list, it is safe for us to add
1886  * the buffer to BJ_Forget list of the newest transaction.
1887  *
1888  * Also we have to clear buffer_mapped flag of a truncated buffer
1889  * because the buffer_head may be attached to the page straddling
1890  * i_size (can happen only when blocksize < pagesize) and thus the
1891  * buffer_head can be reused when the file is extended again. So we end
1892  * up keeping around invalidated buffers attached to transactions'
1893  * BJ_Forget list just to stop checkpointing code from cleaning up
1894  * the transaction this buffer was modified in.
1895  */
1896  transaction = jh->b_transaction;
1897  if (transaction == NULL) {
1898  /* First case: not on any transaction. If it
1899  * has no checkpoint link, then we can zap it:
1900  * it's a writeback-mode buffer so we don't care
1901  * if it hits disk safely. */
1902  if (!jh->b_cp_transaction) {
1903  JBUFFER_TRACE(jh, "not on any transaction: zap");
1904  goto zap_buffer;
1905  }
1906 
1907  if (!buffer_dirty(bh)) {
1908  /* bdflush has written it. We can drop it now */
1909  goto zap_buffer;
1910  }
1911 
1912  /* OK, it must be in the journal but still not
1913  * written fully to disk: it's metadata or
1914  * journaled data... */
1915 
1916  if (journal->j_running_transaction) {
1917  /* ... and once the current transaction has
1918  * committed, the buffer won't be needed any
1919  * longer. */
1920  JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1921  may_free = __dispose_buffer(jh,
1922  journal->j_running_transaction);
1923  goto zap_buffer;
1924  } else {
1925  /* There is no currently-running transaction. So the
1926  * orphan record which we wrote for this file must have
1927  * passed into commit. We must attach this buffer to
1928  * the committing transaction, if it exists. */
1929  if (journal->j_committing_transaction) {
1930  JBUFFER_TRACE(jh, "give to committing trans");
1931  may_free = __dispose_buffer(jh,
1932  journal->j_committing_transaction);
1933  goto zap_buffer;
1934  } else {
1935  /* The orphan record's transaction has
1936  * committed. We can cleanse this buffer */
1937  clear_buffer_jbddirty(bh);
1938  goto zap_buffer;
1939  }
1940  }
1941  } else if (transaction == journal->j_committing_transaction) {
1942  JBUFFER_TRACE(jh, "on committing transaction");
1943  if (jh->b_jlist == BJ_Locked) {
1944  /*
1945  * The buffer is on the committing transaction's locked
1946  * list. We have the buffer locked, so I/O has
1947  * completed. So we can nail the buffer now.
1948  */
1949  may_free = __dispose_buffer(jh, transaction);
1950  goto zap_buffer;
1951  }
1952  /*
1953  * The buffer is committing, we simply cannot touch
1954  * it. If the page is straddling i_size we have to wait
1955  * for commit and try again.
1956  */
1957  if (partial_page) {
1958  tid_t tid = journal->j_committing_transaction->t_tid;
1959 
1960  journal_put_journal_head(jh);
1961  spin_unlock(&journal->j_list_lock);
1962  jbd_unlock_bh_state(bh);
1963  spin_unlock(&journal->j_state_lock);
1964  unlock_buffer(bh);
1965  log_wait_commit(journal, tid);
1966  lock_buffer(bh);
1967  goto retry;
1968  }
1969  /*
1970  * OK, buffer won't be reachable after truncate. We just set
1971  * b_next_transaction to the running transaction (if there is
1972  * one) and mark buffer as freed so that commit code knows it
1973  * should clear dirty bits when it is done with the buffer.
1974  */
1975  set_buffer_freed(bh);
1976  if (journal->j_running_transaction && buffer_jbddirty(bh))
1977  jh->b_next_transaction = journal->j_running_transaction;
1978  journal_put_journal_head(jh);
1979  spin_unlock(&journal->j_list_lock);
1980  jbd_unlock_bh_state(bh);
1981  spin_unlock(&journal->j_state_lock);
1982  return 0;
1983  } else {
1984  /* Good, the buffer belongs to the running transaction.
1985  * We are writing our own transaction's data, not any
1986  * previous one's, so it is safe to throw it away
1987  * (remember that we expect the filesystem to have set
1988  * i_size already for this truncate so recovery will not
1989  * expose the disk blocks we are discarding here). */
1990  J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1991  JBUFFER_TRACE(jh, "on running transaction");
1992  may_free = __dispose_buffer(jh, transaction);
1993  }
1994 
1995 zap_buffer:
1996  /*
1997  * This is tricky. Although the buffer is truncated, it may be reused
1998  * if blocksize < pagesize and it is attached to the page straddling
1999  * EOF. Since the buffer might have been added to the BJ_Forget list of the
2000  * running transaction, journal_get_write_access() won't clear
2001  * b_modified and credit accounting gets confused. So clear b_modified
2002  * here. */
2003  jh->b_modified = 0;
2004  journal_put_journal_head(jh);
2005 zap_buffer_no_jh:
2006  spin_unlock(&journal->j_list_lock);
2007  jbd_unlock_bh_state(bh);
2008  spin_unlock(&journal->j_state_lock);
2009 zap_buffer_unlocked:
2010  clear_buffer_dirty(bh);
2011  J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2012  clear_buffer_mapped(bh);
2013  clear_buffer_req(bh);
2014  clear_buffer_new(bh);
2015  bh->b_bdev = NULL;
2016  return may_free;
2017 }
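The partial_page branch above must drop every lock before sleeping in log_wait_commit() and then retry from scratch. A minimal sketch of that wait step in isolation, assuming no locks are held on entry (illustrative only, not a helper this file provides):

#include <linux/jbd.h>

/* Sketch: wait for whatever transaction is currently committing, the way
 * the partial_page case does before its "goto retry".  log_wait_commit()
 * sleeps, so no spinlocks may be held across the call. */
static void example_wait_for_committing_transaction(journal_t *journal)
{
	tid_t tid = 0;
	int need_wait = 0;

	spin_lock(&journal->j_state_lock);
	if (journal->j_committing_transaction) {
		tid = journal->j_committing_transaction->t_tid;
		need_wait = 1;
	}
	spin_unlock(&journal->j_state_lock);

	if (need_wait)
		log_wait_commit(journal, tid);
}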
2018 
2027 void journal_invalidatepage(journal_t *journal,
2028  struct page *page,
2029  unsigned long offset)
2030 {
2031  struct buffer_head *head, *bh, *next;
2032  unsigned int curr_off = 0;
2033  int may_free = 1;
2034 
2035  if (!PageLocked(page))
2036  BUG();
2037  if (!page_has_buffers(page))
2038  return;
2039 
2040  /* We will potentially be playing with lists other than just the
2041  * data lists (especially for journaled data mode), so be
2042  * cautious in our locking. */
2043 
2044  head = bh = page_buffers(page);
2045  do {
2046  unsigned int next_off = curr_off + bh->b_size;
2047  next = bh->b_this_page;
2048 
2049  if (offset <= curr_off) {
2050  /* This block is wholly outside the truncation point */
2051  lock_buffer(bh);
2052  may_free &= journal_unmap_buffer(journal, bh,
2053  offset > 0);
2054  unlock_buffer(bh);
2055  }
2056  curr_off = next_off;
2057  bh = next;
2058 
2059  } while (bh != head);
2060 
2061  if (!offset) {
2062  if (may_free && try_to_free_buffers(page))
2063  J_ASSERT(!page_has_buffers(page));
2064  }
2065 }
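As a usage sketch, a journaling filesystem wires journal_invalidatepage() into the ->invalidatepage method of its address_space_operations for the truncate path; ext3 does essentially this. The page_journal() helper below is an assumption standing in for however the filesystem looks up its journal from the page's inode:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/jbd.h>

/* Hypothetical helper: return the journal backing this page's inode.
 * A real filesystem (e.g. ext3) reads this from its superblock info. */
static journal_t *page_journal(struct page *page);

/* Sketch of an ->invalidatepage handler.  The VFS already holds the page
 * lock, which journal_invalidatepage() requires. */
static void example_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = page_journal(page);

	journal_invalidatepage(journal, page, offset);
}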
2066 
2067 /*
2068  * File a buffer on the given transaction list.
2069  */
2070 void __journal_file_buffer(struct journal_head *jh,
2071  transaction_t *transaction, int jlist)
2072 {
2073  struct journal_head **list = NULL;
2074  int was_dirty = 0;
2075  struct buffer_head *bh = jh2bh(jh);
2076 
2077  J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2078  assert_spin_locked(&transaction->t_journal->j_list_lock);
2079 
2080  J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2081  J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2082  jh->b_transaction == NULL);
2083 
2084  if (jh->b_transaction && jh->b_jlist == jlist)
2085  return;
2086 
2087  if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2088  jlist == BJ_Shadow || jlist == BJ_Forget) {
2089  /*
2090  * For metadata buffers, we track the dirty bit in buffer_jbddirty
2091  * instead of buffer_dirty. We should not see a dirty bit set
2092  * here because we clear it in do_get_write_access(), but e.g.
2093  * tune2fs can modify the superblock and set the dirty bit at any
2094  * time, so we try to handle that gracefully.
2095  */
2096  if (buffer_dirty(bh))
2097  warn_dirty_buffer(bh);
2098  if (test_clear_buffer_dirty(bh) ||
2099  test_clear_buffer_jbddirty(bh))
2100  was_dirty = 1;
2101  }
2102 
2103  if (jh->b_transaction)
2104  __journal_temp_unlink_buffer(jh);
2105  else
2106  journal_grab_journal_head(bh);
2107  jh->b_transaction = transaction;
2108 
2109  switch (jlist) {
2110  case BJ_None:
2111  J_ASSERT_JH(jh, !jh->b_committed_data);
2112  J_ASSERT_JH(jh, !jh->b_frozen_data);
2113  return;
2114  case BJ_SyncData:
2115  list = &transaction->t_sync_datalist;
2116  break;
2117  case BJ_Metadata:
2118  transaction->t_nr_buffers++;
2119  list = &transaction->t_buffers;
2120  break;
2121  case BJ_Forget:
2122  list = &transaction->t_forget;
2123  break;
2124  case BJ_IO:
2125  list = &transaction->t_iobuf_list;
2126  break;
2127  case BJ_Shadow:
2128  list = &transaction->t_shadow_list;
2129  break;
2130  case BJ_LogCtl:
2131  list = &transaction->t_log_list;
2132  break;
2133  case BJ_Reserved:
2134  list = &transaction->t_reserved_list;
2135  break;
2136  case BJ_Locked:
2137  list = &transaction->t_locked_list;
2138  break;
2139  }
2140 
2141  __blist_add_buffer(list, jh);
2142  jh->b_jlist = jlist;
2143 
2144  if (was_dirty)
2145  set_buffer_jbddirty(bh);
2146 }
2147 
2148 void journal_file_buffer(struct journal_head *jh,
2149  transaction_t *transaction, int jlist)
2150 {
2151  jbd_lock_bh_state(jh2bh(jh));
2152  spin_lock(&transaction->t_journal->j_list_lock);
2153  __journal_file_buffer(jh, transaction, jlist);
2154  spin_unlock(&transaction->t_journal->j_list_lock);
2155  jbd_unlock_bh_state(jh2bh(jh));
2156 }
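A minimal usage sketch of the locked wrapper above: filing a buffer that a handle has just modified onto its transaction's BJ_Metadata list, roughly what journal_dirty_metadata() arranges internally. The helper is illustrative only and assumes the handle is attached to a running transaction:

#include <linux/jbd.h>

/* Illustrative only: put jh on the handle's transaction BJ_Metadata list
 * through the wrapper that takes bh_state and j_list_lock for us. */
static void example_file_as_metadata(handle_t *handle, struct journal_head *jh)
{
	transaction_t *transaction = handle->h_transaction;

	journal_file_buffer(jh, transaction, BJ_Metadata);
}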
2157 
2158 /*
2159  * Remove a buffer from its current buffer list in preparation for
2160  * dropping it from its current transaction entirely. If the buffer has
2161  * already started to be used by a subsequent transaction, refile the
2162  * buffer on the appropriate list of that transaction.
2163  *
2164  * Called under j_list_lock
2165  * Called under jbd_lock_bh_state(jh2bh(jh))
2166  *
2167  * jh and bh may be already free when this function returns
2168  */
2169 void __journal_refile_buffer(struct journal_head *jh)
2170 {
2171  int was_dirty, jlist;
2172  struct buffer_head *bh = jh2bh(jh);
2173 
2174  J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2175  if (jh->b_transaction)
2176  assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2177 
2178  /* If the buffer is now unused, just drop it. */
2179  if (jh->b_next_transaction == NULL) {
2180  __journal_unfile_buffer(jh);
2181  return;
2182  }
2183 
2184  /*
2185  * It has been modified by a later transaction: add it to the new
2186  * transaction's metadata list.
2187  */
2188 
2189  was_dirty = test_clear_buffer_jbddirty(bh);
2190  __journal_temp_unlink_buffer(jh);
2191  /*
2192  * We set b_transaction here because b_next_transaction will inherit
2193  * our jh reference and thus __journal_file_buffer() must not take a
2194  * new one.
2195  */
2196  jh->b_transaction = jh->b_next_transaction;
2197  jh->b_next_transaction = NULL;
2198  if (buffer_freed(bh))
2199  jlist = BJ_Forget;
2200  else if (jh->b_modified)
2201  jlist = BJ_Metadata;
2202  else
2203  jlist = BJ_Reserved;
2204  __journal_file_buffer(jh, jh->b_transaction, jlist);
2205  J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2206 
2207  if (was_dirty)
2208  set_buffer_jbddirty(bh);
2209 }
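For context, the b_next_transaction hand-off that __journal_refile_buffer() completes is set up earlier, when a buffer still owned by the committing transaction is modified again; do_get_write_access() records the running transaction as the next owner. A sketch of that field protocol only, with the assumed locking noted in the comment:

#include <linux/jbd.h>

/* Illustrative only: note that the running transaction will take over this
 * buffer once the committing transaction is finished with it.  The refile
 * code above later moves the jh onto the new owner's list accordingly. */
static void example_mark_next_owner(struct journal_head *jh,
				    transaction_t *running)
{
	/* Caller is assumed to hold jbd_lock_bh_state(jh2bh(jh)). */
	if (jh->b_transaction != running && jh->b_next_transaction == NULL)
		jh->b_next_transaction = running;
}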
2210 
2211 /*
2212  * __journal_refile_buffer() with necessary locking added. We take our bh
2213  * reference so that we can safely unlock bh.
2214  *
2215  * The jh and bh may be freed by this call.
2216  */
2217 void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2218 {
2219  struct buffer_head *bh = jh2bh(jh);
2220 
2221  /* Get reference so that buffer cannot be freed before we unlock it */
2222  get_bh(bh);
2223  jbd_lock_bh_state(bh);
2224  spin_lock(&journal->j_list_lock);
2225  __journal_refile_buffer(jh);
2226  jbd_unlock_bh_state(bh);
2227  spin_unlock(&journal->j_list_lock);
2228  __brelse(bh);
2229 }
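The main consumer of journal_refile_buffer() is the commit path, which drains a committed transaction's BJ_Forget list once the log writes are done. A simplified sketch of that loop follows; the real journal_commit_transaction() also handles checkpoint insertion and frozen data, which are omitted here:

#include <linux/jbd.h>

/* Simplified sketch: release every buffer left on a committed transaction's
 * BJ_Forget list.  journal_refile_buffer() either drops the jh or hands it
 * to the transaction named in b_next_transaction; either way the buffer
 * leaves t_forget, so the loop makes progress. */
static void example_drain_forget_list(journal_t *journal,
				      transaction_t *commit_transaction)
{
	struct journal_head *jh;

	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		jh = commit_transaction->t_forget;
		/* journal_refile_buffer() takes j_list_lock itself. */
		spin_unlock(&journal->j_list_lock);
		journal_refile_buffer(journal, jh);
		spin_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
}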