ext4: Invert lock ordering of page_lock and transaction start in delalloc
[deliverable/linux.git] / fs / jbd2 / commit.c
CommitLineData
470decc6 1/*
f7f4bccb 2 * linux/fs/jbd2/commit.c
470decc6
DK
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
f7f4bccb 18#include <linux/jbd2.h>
470decc6
DK
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
8e85fb3f 23#include <linux/jiffies.h>
818d276c 24#include <linux/crc32.h>
470decc6
DK
25
/*
 * Default IO end handler for temporary BJ_IO buffer_heads: record the
 * final uptodate state and release the buffer lock taken for the write.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
38
39/*
87c89c23
JK
40 * When an ext4 file is truncated, it is possible that some pages are not
41 * successfully freed, because they are attached to a committing transaction.
470decc6
DK
42 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
45 * the numbers in /proc/meminfo look odd.
46 *
47 * So here, we have a buffer which has just come off the forget list. Look to
48 * see if we can strip all buffers from the backing page.
49 *
50 * Called under lock_journal(), and possibly under journal_datalist_lock. The
51 * caller provided us with a ref against the buffer, and we drop that here.
52 */
/* Try to free the whole page backing a forgotten buffer; always drops
 * the caller's reference on @bh (via __brelse), whether or not the page
 * could be freed. */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	/* A dirty buffer must not be stripped from its page. */
	if (buffer_dirty(bh))
		goto nope;
	/* Ours must be the only reference, else someone is still using it. */
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	/* A non-NULL ->mapping means the page was not truncated. */
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	/* Trylock only: this is best-effort cleanup, never worth blocking. */
	if (TestSetPageLocked(page))
		goto nope;

	/* Pin the page across the brelse so it cannot vanish under us. */
	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}
81
818d276c
GS
82/*
83 * Done it all: now submit the commit record. We should have
470decc6
DK
84 * cleaned up our previous buffers by now, so if we are in abort
85 * mode we can now just skip the rest of the journal write
86 * entirely.
87 *
88 * Returns 1 if the journal needs to be aborted or 0 on success
89 */
818d276c
GS
/*
 * Build and submit the commit record for @commit_transaction.
 *
 * @cbh:       returns the commit block's buffer_head so the caller can
 *             later wait on it (journal_wait_on_commit_record()).
 * @crc32_sum: checksum of the transaction's blocks, stored in the commit
 *             header when JBD2_FEATURE_COMPAT_CHECKSUM is enabled.
 *
 * Returns 0 on success (or if the journal is already aborted), 1 if no
 * descriptor buffer could be obtained, otherwise the submit_bh() result.
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	/* In abort mode we skip the journal write entirely. */
	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the on-disk commit header. */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	get_bh(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/* Use a barrier write only for synchronous commits; async-commit
	 * journals rely on the checksum instead of write ordering. */
	if (journal->j_flags & JBD2_BARRIER &&
		!JBD2_HAS_INCOMPAT_FEATURE(journal,
					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: barrier-based sync failed on %s - "
			"disabling barriers\n",
			bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		/* NOTE(review): the resubmit relies on the first (failed)
		 * submit having unlocked the buffer via the end_io handler
		 * before lock_buffer() is retaken here — confirm against the
		 * block layer's -EOPNOTSUPP completion semantics. */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = submit_bh(WRITE, bh);
	}
	*cbh = bh;
	return ret;
}
167
168/*
169 * This function along with journal_submit_commit_record
170 * allows to write the commit record asynchronously.
171 */
172static int journal_wait_on_commit_record(struct buffer_head *bh)
173{
174 int ret = 0;
175
176 clear_buffer_dirty(bh);
177 wait_on_buffer(bh);
470decc6 178
818d276c
GS
179 if (unlikely(!buffer_uptodate(bh)))
180 ret = -EIO;
181 put_bh(bh); /* One for getblk() */
182 jbd2_journal_put_journal_head(bh2jh(bh));
183
184 return ret;
470decc6
DK
185}
186
c851ed54
JK
187/*
188 * Submit all the data buffers of inode associated with the transaction to
189 * disk.
190 *
191 * We are in a committing transaction. Therefore no new inode can be added to
192 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
193 * operate on from being released while we write out pages.
194 */
/* Start writeback of all data pages of every inode attached to the
 * committing transaction.  Returns the first error seen (later errors
 * are dropped so the earliest failure is reported). */
static int journal_submit_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/* JI_COMMIT_RUNNING keeps this jinode alive while we drop
		 * j_list_lock to do the (sleeping) writeback below. */
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawrite_range(mapping, 0,
					i_size_read(jinode->i_vfs_inode));
		/* Keep only the first error. */
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
219
220/*
221 * Wait for data submitted for writeout, refile inodes to proper
222 * transaction if needed.
223 *
224 */
/* Wait for the data writeback started by journal_submit_inode_data_buffers()
 * to complete, then move each inode either to the next transaction that
 * dirtied it or off the journal's lists entirely.  Returns the first
 * error seen while waiting. */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_inode_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		/* Keep only the first error. */
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	/* _safe variant: list_del() below removes entries as we walk. */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Inode was dirtied again: hand it to that
			 * (still running) transaction. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
261
818d276c
GS
262static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
263{
264 struct page *page = bh->b_page;
265 char *addr;
266 __u32 checksum;
267
268 addr = kmap_atomic(page, KM_USER0);
269 checksum = crc32_be(crc32_sum,
270 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
271 kunmap_atomic(addr, KM_USER0);
272
273 return checksum;
274}
275
276static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
18eba7aa 277 unsigned long long block)
b517bea1
ZB
278{
279 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
cd02ff0b 280 if (tag_bytes > JBD2_TAG_SIZE32)
b517bea1
ZB
281 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
282}
283
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 *
 * Overall flow: lock down the running transaction and drain updates,
 * release reserved buffers, flush data and revoke records, write the
 * metadata blocks (with descriptor blocks and tags), write and wait for
 * the commit record, then refile/checkpoint all buffers and retire the
 * transaction.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	stats.u.run.rs_wait = commit_transaction->t_max_wait;
	stats.u.run.rs_locked = jiffies;
	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
						stats.u.run.rs_locked);

	/* Drain: sleep until every handle attached to this transaction
	 * has completed (t_updates hits zero).  The double-check after
	 * prepare_to_wait() closes the race with handle completion. */
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissable.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	stats.u.run.rs_flushing = jiffies;
	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
					       stats.u.run.rs_flushing);

	/* T_FLUSH: from here on new handles go to a fresh running
	 * transaction; this one belongs exclusively to commit. */
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_inode_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	/* NOTE(review): "commit phase 2" is printed twice (also above);
	 * the phase numbering in these debug messages looks stale. */
	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	stats.u.run.rs_logging = jiffies;
	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
						 stats.u.run.rs_logging);
	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.u.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			/* Tags start right after the block header. */
			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		/* flags bit 0 from write_metadata_buffer() means the data
		 * had to be escaped (it matched the journal magic). */
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		/* Only the first tag of a descriptor carries the UUID. */
		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();
			stats.u.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Done it all: now write the commit record asynchronously. */

	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	/*
	 * This is the right place to wait for data buffers both for ASYNC
	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
	 * the commit block went to disk (which happens above). If commit is
	 * SYNC, we need to wait for data buffers before we start writing
	 * commit block, which happens below in such setting.
	 */
	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Synchronous commit: the commit record is written only now,
	 * after all metadata and data have hit the disk. */
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
						commit_transaction->t_start);

	/*
	 * File the transaction for history
	 */
	stats.ts_type = JBD2_STATS_RUN;
	stats.ts_tid = commit_transaction->t_tid;
	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
	spin_lock(&journal->j_history_lock);
	memcpy(journal->j_history + journal->j_history_cur, &stats,
			sizeof(stats));
	if (++journal->j_history_cur == journal->j_history_max)
		journal->j_history_cur = 0;

	/*
	 * Calculate overall stats
	 */
	journal->j_stats.ts_tid++;
	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

	/* j_list_lock is still held from the restart_loop section above
	 * (see the "sleazy" comment); it is released after the transaction
	 * has been dropped or queued for checkpointing. */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
	} else {
		/* Link the transaction onto the circular checkpoint list. */
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}
This page took 0.292883 seconds and 5 git commands to generate.