Linux Kernel 3.7.1
commit.c
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <[email protected]>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j,
				       struct journal_head *descriptor)
{
	struct commit_header *h;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	h = (struct commit_header *)(jh2bh(descriptor)->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
			   j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, descriptor);

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
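
/*
 * Editor's aside -- a minimal userspace sketch of the endianness
 * technique used above: journal_submit_commit_record() serializes the
 * commit header in big-endian order with cpu_to_be32()/cpu_to_be64().
 * The sketch uses glibc's htobe32()/htobe64() as stand-ins for the
 * kernel helpers; the struct layout and values here are invented for
 * illustration, not the on-disk jbd2 format.  Build with: cc demo_endian.c
 */
#define _DEFAULT_SOURCE
#include <endian.h>		/* htobe32, htobe64 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

struct demo_commit_header {	/* illustrative layout only */
	uint32_t h_magic;
	uint32_t h_sequence;
	uint64_t h_commit_sec;
	uint32_t h_commit_nsec;
};

int main(void)
{
	struct timespec now;
	struct demo_commit_header h;
	unsigned char buf[sizeof(h)];

	clock_gettime(CLOCK_REALTIME, &now);
	h.h_magic       = htobe32(0xc03b3998u);	/* JBD2_MAGIC_NUMBER */
	h.h_sequence    = htobe32(42);
	h.h_commit_sec  = htobe64((uint64_t)now.tv_sec);
	h.h_commit_nsec = htobe32((uint32_t)now.tv_nsec);

	/* The first byte in the buffer is now the most significant one. */
	memcpy(buf, &h, sizeof(h));
	printf("magic on disk: %02x %02x %02x %02x\n",
	       buf[0], buf[1], buf[2], buf[3]);	/* c0 3b 39 98 */
	return 0;
}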

/*
 * This function along with journal_submit_commit_record
 * allows writing the commit record asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc.  We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction.  Therefore no new inode can be added to
 * our inode list.  We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers.  We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
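
/*
 * Editor's aside -- the __JI_COMMIT_RUNNING dance above (mark the inode
 * busy, drop j_list_lock for the blocking IO, retake the lock, clear the
 * bit and wake waiters) is a general pattern for working on a list entry
 * without holding the list lock across blocking work.  Below is a
 * minimal pthread sketch of that pattern, with all names invented for
 * illustration; the mutex supplies the ordering the kernel gets from
 * smp_mb__after_clear_bit() + wake_up_bit().  Build with:
 * cc -pthread demo_busy.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct entry {
	int busy;			/* analogous to __JI_COMMIT_RUNNING */
	pthread_mutex_t *lock;		/* analogous to j_list_lock */
	pthread_cond_t done;		/* analogous to the bit waitqueue */
};

static void process_entry(struct entry *e)
{
	pthread_mutex_lock(e->lock);
	e->busy = 1;			/* mark: don't release me */
	pthread_mutex_unlock(e->lock);	/* drop the lock for slow work */

	usleep(1000);			/* stand-in for submitting IO */

	pthread_mutex_lock(e->lock);
	e->busy = 0;			  /* clear the flag... */
	pthread_cond_broadcast(&e->done); /* ...then wake any waiter */
	pthread_mutex_unlock(e->lock);
}

/* A releaser must wait until the entry is no longer in use. */
static void wait_until_idle(struct entry *e)
{
	pthread_mutex_lock(e->lock);
	while (e->busy)
		pthread_cond_wait(&e->done, e->lock);
	pthread_mutex_unlock(e->lock);
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct entry e = { 0, &lock, PTHREAD_COND_INITIALIZER };

	process_entry(&e);
	wait_until_idle(&e);
	printf("entry idle, safe to release\n");
	return 0;
}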

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}
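
/*
 * Editor's aside -- crc32_be() above is seeded with the running
 * crc32_sum, so the transaction checksum is built incrementally, one
 * buffer at a time: checksumming A then B with chaining gives the same
 * result as checksumming the concatenation A||B.  A self-contained
 * userspace sketch of that chaining property, with a bit-at-a-time
 * big-endian CRC-32 standing in for the kernel's crc32_be().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit-at-a-time CRC-32/BE, polynomial 0x04c11db7, no final xor; a
 * stand-in for crc32_be(), not claimed to be byte-exact with it. */
static uint32_t crc32_be_demo(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint32_t)*p++ << 24;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x80000000u) ? (crc << 1) ^ 0x04c11db7u
						  : crc << 1;
	}
	return crc;
}

int main(void)
{
	const uint8_t a[] = "metadata block A";
	const uint8_t b[] = "metadata block B";
	uint8_t both[sizeof(a) + sizeof(b)];
	uint32_t sum = ~0u;		/* same seed as crc32_sum above */

	memcpy(both, a, sizeof(a));
	memcpy(both + sizeof(a), b, sizeof(b));

	/* Chain: feed the running sum back in as the seed... */
	sum = crc32_be_demo(sum, a, sizeof(a));
	sum = crc32_be_demo(sum, b, sizeof(b));

	/* ...and one pass over the concatenation agrees. */
	assert(sum == crc32_be_demo(~0u, both, sizeof(both)));
	printf("chained crc32_be: 0x%08x\n", sum);
	return 0;
}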

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
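
/*
 * Editor's aside -- write_tag_block() stores a 64-bit block number as
 * two 32-bit big-endian halves; the double shift ((block >> 31) >> 1)
 * equals block >> 32 but stays well-defined even on a configuration
 * where the operand type is only 32 bits wide (a single 32-bit shift of
 * a 32-bit value is undefined in C).  A quick userspace check of the
 * split, with the block number invented for illustration:
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned long long block = 0x0000000123456789ULL;

	uint32_t lo = (uint32_t)(block & (uint32_t)~0);
	uint32_t hi = (uint32_t)((block >> 31) >> 1);	/* == block >> 32 */

	assert(((unsigned long long)hi << 32 | lo) == block);
	printf("high=0x%08x low=0x%08x\n", hi, lo);
	return 0;
}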

static void jbd2_descr_block_csum_set(journal_t *j,
				      struct journal_head *descriptor)
{
	struct jbd2_journal_block_tail *tail;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	tail = (struct jbd2_journal_block_tail *)
			(jh2bh(descriptor)->b_data + j->j_blocksize -
			sizeof(struct jbd2_journal_block_tail));
	tail->t_checksum = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
			   j->j_blocksize);
	tail->t_checksum = cpu_to_be32(csum);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	sequence = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
			   sizeof(sequence));
	csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
			   bh->b_size);
	kunmap_atomic(addr);

	tag->t_checksum = cpu_to_be32(csum);
}
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;

	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear revoked flag to reflect there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO*/

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled by
		 * a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		jbd2_journal_free_transaction(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}
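
/*
 * Editor's aside -- the commit-time smoothing near the end of
 * jbd2_journal_commit_transaction() is an exponentially weighted moving
 * average with a 3/4 weight on the old value, so a single slow commit
 * only nudges j_average_commit_time.  A quick userspace illustration of
 * how the estimate absorbs an outlier (all sample values invented):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Pretend commit times in nanoseconds, with one outlier. */
	uint64_t samples[] = { 4000, 4200, 50000, 4100, 3900 };
	uint64_t avg = 0;

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		if (avg)
			avg = (samples[i] + avg * 3) / 4; /* weighted update */
		else
			avg = samples[i];		  /* first sample */
		printf("sample=%5llu  avg=%5llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg);
	}
	return 0;
}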