From f5a7a6b0d9b6af7d46124ed3f6b3995225cb62d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: jbd2: Fix assertion failure in fs/jbd2/checkpoint.c Before we start committing a transaction, we call __journal_clean_checkpoint_list() to cleanup transaction's written-back buffers. If this call happens to remove all of them (and there were already some buffers), __journal_remove_checkpoint() will decide to free the transaction because it isn't (yet) a committing transaction and soon we fail some assertion - the transaction really isn't ready to be freed :). We change the check in __journal_remove_checkpoint() to free only a transaction in T_FINISHED state. The locking there is subtle though (as everywhere in JBD ;(). We use j_list_lock to protect the check and a subsequent call to __journal_drop_transaction() and do the same in the end of journal_commit_transaction() which is the only place where a transaction can get to T_FINISHED state. Probably I'm too paranoid here and such locking is not really necessary - checkpoint lists are processed only from log_do_checkpoint() where a transaction must be already committed to be processed or from __journal_clean_checkpoint_list() where kjournald itself calls it and thus transaction cannot change state either. Better be safe if something changes in future... Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6986f334c64..39b5cee3dd8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -867,10 +867,10 @@ restart_loop: } spin_unlock(&journal->j_list_lock); /* - * This is a bit sleazy. We borrow j_list_lock to protect - * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. - * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but - * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); -- cgit v1.2.3-70-g09d2 From 8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: jbd2: jbd2 stats through procfs The patch below updates the jbd stats patch to 2.6.20/jbd2. The initial patch was posted by Alex Tomas in December 2005 (http://marc.info/?l=linux-ext4&m=113538565128617&w=2). It provides statistics via procfs such as transaction lifetime and size. Sometimes, investigating performance problems, i find useful to have stats from jbd about transaction's lifetime, size, etc. here is a patch for review and inclusion probably. for example, stats after creation of 3M files in htree directory: [root@bob ~]# cat /proc/fs/jbd/sda/history R/C tid wait run lock flush log hndls block inlog ctime write drop close R 261 8260 2720 0 0 750 9892 8170 8187 C 259 750 0 4885 1 R 262 20 2200 10 0 770 9836 8170 8187 R 263 30 2200 10 0 3070 9812 8170 8187 R 264 0 5000 10 0 1340 0 0 0 C 261 8240 3212 4957 0 R 265 8260 1470 0 0 4640 9854 8170 8187 R 266 0 5000 10 0 1460 0 0 0 C 262 8210 2989 4868 0 R 267 8230 1490 10 0 4440 9875 8171 8188 R 268 0 5000 10 0 1260 0 0 0 C 263 7710 2937 4908 0 R 269 7730 1470 10 0 3330 9841 8170 8187 R 270 0 5000 10 0 830 0 0 0 C 265 8140 3234 4898 0 C 267 720 0 4849 1 R 271 8630 2740 20 0 740 9819 8170 8187 C 269 800 0 4214 1 R 272 40 2170 10 0 830 9716 8170 8187 R 273 40 2280 0 0 3530 9799 8170 8187 R 274 0 5000 10 0 990 0 0 0 where, R - line for transaction's life from T_RUNNING to T_FINISHED C - line for transaction's checkpointing tid - transaction's id wait - for how long we were waiting for new transaction to start (the longest period journal_start() took in this transaction) run - real transaction's lifetime (from T_RUNNING to T_LOCKED lock - how long we were waiting for all handles to close (time the transaction was in T_LOCKED) flush - how long it took to flush all data (data=ordered) log - how long it took to write the transaction to the log hndls - how many handles got to the transaction block - how many blocks got to the transaction inlog - how many blocks are written to the log (block + descriptors) ctime - how long it took to checkpoint the transaction write - how many blocks have been written during checkpointing drop - how many blocks have been dropped during checkpointing close - how many running transactions have been closed to checkpoint this one all times are in msec. [root@bob ~]# cat /proc/fs/jbd/sda/info 280 transaction, each upto 8192 blocks average: 1633ms waiting for transaction 3616ms running transaction 5ms transaction was being locked 1ms flushing data (in ordered mode) 1799ms logging transaction 11781 handles per transaction 5629 blocks per transaction 5641 logged blocks per transaction Signed-off-by: Johann Lombardi Signed-off-by: Mariusz Kozlowski Signed-off-by: Mingming Cao Signed-off-by: Eric Sandeen --- fs/jbd2/checkpoint.c | 10 +- fs/jbd2/commit.c | 49 ++++++++ fs/jbd2/journal.c | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/jbd2/transaction.c | 9 ++ include/linux/jbd2.h | 77 ++++++++++++ 5 files changed, 481 insertions(+), 2 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7e958c86242..1b7f282c1ae 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) + struct buffer_head **bhs, int *batch_count, + transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; + transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); jbd2_log_start_commit(journal, tid); @@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); + transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == NR_BATCH) { spin_unlock(&journal->j_list_lock); @@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) if (!journal->j_checkpoint_transactions) goto out; transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; this_tid = transaction->t_tid; restart: /* @@ -346,7 +351,8 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs,&batch_count); + retry = __process_buffer(journal, jh, bhs, &batch_count, + transaction); if (!retry && lock_need_resched(&journal->j_list_lock)){ spin_unlock(&journal->j_list_lock); retry = 1; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 39b5cee3dd8..8749a86f417 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -290,6 +291,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, */ void jbd2_journal_commit_transaction(journal_t *journal) { + struct transaction_stats_s stats; transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; @@ -337,6 +339,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + stats.u.run.rs_wait = commit_transaction->t_max_wait; + stats.u.run.rs_locked = jiffies; + stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.u.run.rs_locked); + spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -407,6 +414,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + stats.u.run.rs_flushing = jiffies; + stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, + stats.u.run.rs_flushing); + commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -498,6 +509,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ commit_transaction->t_state = T_COMMIT; + stats.u.run.rs_logging = jiffies; + stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, + stats.u.run.rs_logging); + stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.u.run.rs_blocks_logged = 0; + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -646,6 +663,7 @@ start_journal_io: submit_bh(WRITE, bh); } cond_resched(); + stats.u.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -816,6 +834,7 @@ restart_loop: cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; __jbd2_journal_remove_checkpoint(jh); } @@ -890,6 +909,36 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_start = jiffies; + stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction for history + */ + stats.ts_type = JBD2_STATS_RUN; + stats.ts_tid = commit_transaction->t_tid; + stats.u.run.rs_handle_count = commit_transaction->t_handle_count; + spin_lock(&journal->j_history_lock); + memcpy(journal->j_history + journal->j_history_cur, &stats, + sizeof(stats)); + if (++journal->j_history_cur == journal->j_history_max) + journal->j_history_cur = 0; + + /* + * Calculate overall stats + */ + journal->j_stats.ts_tid++; + journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; + journal->j_stats.u.run.rs_running += stats.u.run.rs_running; + journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; + journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; + journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; + journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; + journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; + journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); + commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6ddc5531587..3667c91bc78 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, + struct transaction_stats_s *ts, + int first) +{ + if (ts == s->stats + s->max) + ts = s->stats; + if (!first && ts == s->stats + s->start) + return NULL; + while (ts->ts_type == 0) { + ts++; + if (ts == s->stats + s->max) + ts = s->stats; + if (ts == s->stats + s->start) + return NULL; + } + return ts; + +} + +static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) +{ + struct jbd2_stats_proc_session *s = seq->private; + struct transaction_stats_s *ts; + int l = *pos; + + if (l == 0) + return SEQ_START_TOKEN; + ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); + if (!ts) + return NULL; + l--; + while (l) { + ts = jbd2_history_skip_empty(s, ++ts, 0); + if (!ts) + break; + l--; + } + return ts; +} + +static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct jbd2_stats_proc_session *s = seq->private; + struct transaction_stats_s *ts = v; + + ++*pos; + if (v == SEQ_START_TOKEN) + return jbd2_history_skip_empty(s, s->stats + s->start, 1); + else + return jbd2_history_skip_empty(s, ++ts, 0); +} + +static int jbd2_seq_history_show(struct seq_file *seq, void *v) +{ + struct transaction_stats_s *ts = v; + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " + "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", + "wait", "run", "lock", "flush", "log", "hndls", + "block", "inlog", "ctime", "write", "drop", + "close"); + return 0; + } + if (ts->ts_type == JBD2_STATS_RUN) + seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " + "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, + jiffies_to_msecs(ts->u.run.rs_wait), + jiffies_to_msecs(ts->u.run.rs_running), + jiffies_to_msecs(ts->u.run.rs_locked), + jiffies_to_msecs(ts->u.run.rs_flushing), + jiffies_to_msecs(ts->u.run.rs_logging), + ts->u.run.rs_handle_count, + ts->u.run.rs_blocks, + ts->u.run.rs_blocks_logged); + else if (ts->ts_type == JBD2_STATS_CHECKPOINT) + seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", + "C", ts->ts_tid, " ", + jiffies_to_msecs(ts->u.chp.cs_chp_time), + ts->u.chp.cs_written, ts->u.chp.cs_dropped, + ts->u.chp.cs_forced_to_close); + else + J_ASSERT(0); + return 0; +} + +static void jbd2_seq_history_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations jbd2_seq_history_ops = { + .start = jbd2_seq_history_start, + .next = jbd2_seq_history_next, + .stop = jbd2_seq_history_stop, + .show = jbd2_seq_history_show, +}; + +static int jbd2_seq_history_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s) * journal->j_history_max; + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, journal->j_history, size); + s->max = journal->j_history_max; + s->start = journal->j_history_cur % s->max; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_history_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_history_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static struct file_operations jbd2_seq_history_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_history_release, +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transaction, each upto %u blocks\n", + s->stats->ts_tid, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->u.run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->u.run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static struct file_operations jbd2_seq_info_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + char name[BDEVNAME_SIZE]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); + if (journal->j_proc_entry) { + struct proc_dir_entry *p; + p = create_proc_entry("history", S_IRUGO, + journal->j_proc_entry); + if (p) { + p->proc_fops = &jbd2_seq_history_fops; + p->data = journal; + p = create_proc_entry("info", S_IRUGO, + journal->j_proc_entry); + if (p) { + p->proc_fops = &jbd2_seq_info_fops; + p->data = journal; + } + } + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + char name[BDEVNAME_SIZE]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry("history", journal->j_proc_entry); + remove_proc_entry(name, proc_jbd2_stats); +} + +static void journal_init_stats(journal_t *journal) +{ + int size; + + if (!proc_jbd2_stats) + return; + + journal->j_history_max = 100; + size = sizeof(struct transaction_stats_s) * journal->j_history_max; + journal->j_history = kzalloc(size, GFP_KERNEL); + if (!journal->j_history) { + journal->j_history_max = 0; + return; + } + spin_lock_init(&journal->j_history_lock); +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -681,6 +988,9 @@ static journal_t * journal_init_common (void) kfree(journal); goto fail; } + + journal_init_stats(journal); + return journal; fail: return NULL; @@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; + jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); J_ASSERT(bh != NULL); @@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; journal->j_blocksize = inode->i_sb->s_blocksize; + jbd2_stats_proc_init(journal); /* journal descriptor can store up to n blocks -bzzz */ n = journal->j_blocksize / sizeof(journal_block_tag_t); @@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); if (journal->j_inode) iput(journal->j_inode); if (journal->j_revoke) @@ -1900,6 +2214,28 @@ static void __exit jbd2_remove_debugfs_entry(void) #endif +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + struct kmem_cache *jbd2_handle_cache; static int __init journal_init_handle_cache(void) @@ -1955,6 +2291,7 @@ static int __init journal_init(void) if (ret != 0) jbd2_journal_destroy_caches(); jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); return ret; } @@ -1966,6 +2303,7 @@ static void __exit journal_exit(void) printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); + jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b1fcf2b3dca..f30802aeefa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -59,6 +59,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; return transaction; } @@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle) int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; int ret = 0; + unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -217,6 +220,12 @@ repeat_locked: /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ + if (time_after(transaction->t_start, ts)) { + ts = jbd2_time_diff(ts, transaction->t_start); + if (ts > transaction->t_max_wait) + transaction->t_max_wait = ts; + } + handle->h_transaction = transaction; transaction->t_outstanding_credits += nblocks; transaction->t_updates++; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d861ffd4982..685640036e8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -395,6 +395,16 @@ struct handle_s }; +/* + * Some stats for checkpoint phase + */ +struct transaction_chp_stats_s { + unsigned long cs_chp_time; + unsigned long cs_forced_to_close; + unsigned long cs_written; + unsigned long cs_dropped; +}; + /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * @@ -531,6 +541,21 @@ struct transaction_s */ spinlock_t t_handle_lock; + /* + * Longest time some handle had to wait for running transaction + */ + unsigned long t_max_wait; + + /* + * When transaction started + */ + unsigned long t_start; + + /* + * Checkpointing stats [j_checkpoint_sem] + */ + struct transaction_chp_stats_s t_chp_stats; + /* * Number of outstanding updates running on this transaction * [t_handle_lock] @@ -562,6 +587,39 @@ struct transaction_s }; +struct transaction_run_stats_s { + unsigned long rs_wait; + unsigned long rs_running; + unsigned long rs_locked; + unsigned long rs_flushing; + unsigned long rs_logging; + + unsigned long rs_handle_count; + unsigned long rs_blocks; + unsigned long rs_blocks_logged; +}; + +struct transaction_stats_s { + int ts_type; + unsigned long ts_tid; + union { + struct transaction_run_stats_s run; + struct transaction_chp_stats_s chp; + } u; +}; + +#define JBD2_STATS_RUN 1 +#define JBD2_STATS_CHECKPOINT 2 + +static inline unsigned long +jbd2_time_diff(unsigned long start, unsigned long end) +{ + if (end >= start) + return end - start; + + return end + (MAX_JIFFY_OFFSET - start); +} + /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. @@ -623,6 +681,12 @@ struct transaction_s * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write + * @j_history: Buffer storing the transactions statistics history + * @j_history_max: Maximum number of transactions in the statistics history + * @j_history_cur: Current number of transactions in the statistics history + * @j_history_lock: Protect the transactions statistics history + * @j_proc_entry: procfs entry for the jbd statistics directory + * @j_stats: Overall statistics * @j_private: An opaque pointer to fs-private information. */ @@ -814,6 +878,19 @@ struct journal_s pid_t j_last_sync_writer; + /* + * Journal statistics + */ + struct transaction_stats_s *j_history; + int j_history_max; + int j_history_cur; + /* + * Protect the transactions statistics history + */ + spinlock_t j_history_lock; + struct proc_dir_entry *j_proc_entry; + struct transaction_stats_s j_stats; + /* * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here -- cgit v1.2.3-70-g09d2 From 818d276ceb83aa9fdebb5e0a53188290312de987 Mon Sep 17 00:00:00 2001 From: Girish Shilamkar Date: Mon, 28 Jan 2008 23:58:27 -0500 Subject: ext4: Add the journal checksum feature The journal checksum feature adds two new flags i.e JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM. JBD2_FEATURE_CHECKSUM flag indicates that the commit block contains the checksum for the blocks described by the descriptor blocks. Due to checksums, writing of the commit record no longer needs to be synchronous. Now commit record can be sent to disk without waiting for descriptor blocks to be written to disk. This behavior is controlled using JBD2_FEATURE_ASYNC_COMMIT flag. Older kernels/e2fsck should not be able to recover the journal with _ASYNC_COMMIT hence it is made incompat. The commit header has been extended to hold the checksum along with the type of the checksum. For recovery in pass scan checksums are verified to ensure the sanity and completeness(in case of _ASYNC_COMMIT) of every transaction. Signed-off-by: Andreas Dilger Signed-off-by: Girish Shilamkar Signed-off-by: Dave Kleikamp Signed-off-by: Mingming Cao --- Documentation/filesystems/ext4.txt | 10 ++ fs/Kconfig | 1 + fs/ext4/super.c | 25 +++++ fs/jbd2/commit.c | 198 ++++++++++++++++++++++++++++--------- fs/jbd2/journal.c | 26 +++++ fs/jbd2/recovery.c | 151 ++++++++++++++++++++++++++-- include/linux/ext4_fs.h | 3 +- include/linux/jbd2.h | 36 ++++++- 8 files changed, 388 insertions(+), 62 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6a4adcae9f9..4f329afe20e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -89,6 +89,16 @@ When mounting an ext4 filesystem, the following option are accepted: extents ext4 will use extents to address file data. The file system will no longer be mountable by ext3. +journal_checksum Enable checksumming of the journal transactions. + This will allow the recovery code in e2fsck and the + kernel to detect corruption in the kernel. It is a + compatible change and will be ignored by older kernels. + +journal_async_commit Commit block can be written to disk without waiting + for descriptor blocks. If enabled older kernels cannot + mount the device. This will enable 'journal_checksum' + internally. + journal=update Update the ext4 file system's journal to the current format. diff --git a/fs/Kconfig b/fs/Kconfig index 9656139d2e9..219ec06a8c7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -236,6 +236,7 @@ config JBD_DEBUG config JBD2 tristate + select CRC32 help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7305443e10..f7479d30735 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -869,6 +869,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, @@ -908,6 +909,8 @@ static match_table_t tokens = { {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -1095,6 +1098,13 @@ static int parse_options (char *options, struct super_block *sb, return 0; *journal_devnum = option; break; + case Opt_journal_checksum: + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; + case Opt_journal_async_commit: + set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -2114,6 +2124,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount4; } + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8749a86f417..da8d0eb3b7b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) return 1; } -/* Done it all: now write the commit record. We should have +/* + * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write * entirely. * * Returns 1 if the journal needs to be aborted or 0 on success */ -static int journal_write_commit_record(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) { struct journal_head *descriptor; + struct commit_header *tmp; struct buffer_head *bh; - int i, ret; + int ret; int barrier_done = 0; if (is_journal_aborted(journal)) @@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, bh = jh2bh(descriptor); - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < bh->b_size; i += 512) { - journal_header_t *tmp = (journal_header_t*)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } - JBUFFER_TRACE(descriptor, "write commit block"); + JBUFFER_TRACE(descriptor, "submit commit block"); + lock_buffer(bh); + set_buffer_dirty(bh); - if (journal->j_flags & JBD2_BARRIER) { + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { set_buffer_ordered(bh); barrier_done = 1; } - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); + /* is it possible for another commit to fail at roughly * the same time as this one? If so, we don't want to * trust the barrier flag in the super, but instead want @@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, clear_buffer_ordered(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); } - put_bh(bh); /* One for getblk() */ - jbd2_journal_put_journal_head(descriptor); + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. + */ +static int journal_wait_on_commit_record(struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); - return (ret == -EIO); + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + jbd2_journal_put_journal_head(bh2jh(bh)); + + return ret; } +/* + * Wait for all submitted IO to complete. + */ +static int journal_wait_on_locked_list(journal_t *journal, + transaction_t *commit_transaction) +{ + int ret = 0; + struct journal_head *jh; + + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + spin_lock(&journal->j_list_lock); + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + __jbd2_journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + jbd2_journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + put_bh(bh); + cond_resched_lock(&journal->j_list_lock); + } + return ret; + } + static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) { int i; @@ -275,7 +350,21 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } -static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page, KM_USER0); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr, KM_USER0); + + return checksum; +} + +static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); @@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_flag; int i; int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; /* * First job: lock down the current transaction and wait for @@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal_submit_data_buffers(journal, commit_transaction); /* - * Wait for all previously submitted IO to complete. + * Wait for all previously submitted IO to complete if commit + * record is to be written synchronously. */ spin_lock(&journal->j_list_lock); - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + err = journal_wait_on_locked_list(journal, + commit_transaction); - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } spin_unlock(&journal->j_list_lock); if (err) @@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. + */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -672,6 +749,23 @@ start_journal_io: } } + /* Done it all: now write the commit record asynchronously. */ + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + + spin_lock(&journal->j_list_lock); + err = journal_wait_on_locked_list(journal, + commit_transaction); + spin_unlock(&journal->j_list_lock); + if (err) + __jbd2_journal_abort_hard(journal); + } + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -771,8 +865,14 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 6\n"); - if (journal_write_commit_record(journal, commit_transaction)) - err = -EIO; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + err = journal_wait_on_commit_record(cbh); if (err) jbd2_journal_abort(journal, err); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3667c91bc78..59ba2494dca 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1578,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, return 1; } +/* + * jbd2_journal_clear_features () - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. + */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); /** * int jbd2_journal_update_format () - Update on-disk journal structure. diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d0ce627539e..921680663fa 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include #include #include +#include #endif /* @@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag return block; } +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. + */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + } + return 0; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; int tag_bytes = journal_tag_bytes(journal); + __u32 crc32_sum = ~0; /* Transactional Checksums */ /* Precompute the maximum metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; @@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* If it is a valid descriptor block, replay it - * in pass REPLAY; otherwise, just skip over the - * blocks it describes. */ + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) { + put_bh(bh); + break; + } + put_bh(bh); + continue; + } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - brelse(bh); + put_bh(bh); continue; } @@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal, continue; case JBD2_COMMIT_BLOCK: - /* Found an expected commit block: not much to - * do other than move on to the next sequence + /* How to differentiate between interrupted commit + * and journal corruption ? + * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + + /* Found an expected commit block: if checksums + * are present verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence * number. */ + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + int chksum_err, chksum_seen; + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + chksum_err = chksum_seen = 0; + + if (info->end_transaction) { + printk(KERN_ERR "JBD: Transaction %u " + "found to be corrupt.\n", + next_commit_ID - 1); + brelse(bh); + break; + } + + if (crc32_sum == found_chksum && + cbh->h_chksum_type == JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) + chksum_seen = 1; + else if (!(cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0 && + !chksum_seen)) + /* + * If fs is mounted using an old kernel and then + * kernel with journal_chksum is used then we + * get a situation where the journal flag has + * checksum flag set but checksums are not + * present i.e chksum = 0, in the individual + * commit blocks. + * Hence to avoid checksum failures, in this + * situation, this extra check is added. + */ + chksum_err = 1; + + if (chksum_err) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + printk(KERN_ERR + "JBD: Transaction %u " + "found to be corrupt.\n", + next_commit_ID); + brelse(bh); + break; + } + } + crc32_sum = ~0; + } brelse(bh); next_commit_ID++; continue; @@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal, * transaction marks the end of the valid log. */ - if (pass == PASS_SCAN) - info->end_transaction = next_commit_ID; - else { + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + } else { /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 300cc5a5adb..cd406dba0e6 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h @@ -467,7 +467,8 @@ do { \ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ - +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 685640036e8..98a2bc5d3e3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -149,6 +149,28 @@ typedef struct journal_header_s __be32 h_sequence; } journal_header_t; +/* + * Checksum types. + */ +#define JBD2_CRC32_CHKSUM 1 +#define JBD2_MD5_CHKSUM 2 +#define JBD2_SHA1_CHKSUM 3 + +#define JBD2_CRC32_CHKSUM_SIZE 4 + +#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) +/* + * Commit block header for storing transactional checksums: + */ +struct commit_header { + __be32 h_magic; + __be32 h_blocktype; + __be32 h_sequence; + unsigned char h_chksum_type; + unsigned char h_chksum_size; + unsigned char h_padding[2]; + __be32 h_chksum[JBD2_CHECKSUM_BYTES]; +}; /* * The block tag: used to describe a single buffer in the journal. @@ -242,14 +264,18 @@ typedef struct journal_superblock_s ((j)->j_format_version >= 2 && \ ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) -#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 -#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 +#define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 + +#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 +#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 +#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 /* Features known to this kernel version: */ -#define JBD2_KNOWN_COMPAT_FEATURES 0 +#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ - JBD2_FEATURE_INCOMPAT_64BIT) + JBD2_FEATURE_INCOMPAT_64BIT | \ + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) #ifdef __KERNEL__ @@ -997,6 +1023,8 @@ extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); +extern void jbd2_journal_clear_features + (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_create (journal_t *); extern int jbd2_journal_load (journal_t *journal); extern void jbd2_journal_destroy (journal_t *); -- cgit v1.2.3-70-g09d2 From 95c354fe9f7d6decc08a92aa26eb233ecc2155bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 30 Jan 2008 13:31:20 +0100 Subject: spinlock: lockbreak cleanup The break_lock data structure and code for spinlocks is quite nasty. Not only does it double the size of a spinlock but it changes locking to a potentially less optimal trylock. Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a __raw_spin_is_contended that uses the lock data itself to determine whether there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is not set. Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to decouple it from the spinlock implementation, and make it typesafe (rwlocks do not have any need_lockbreak sites -- why do they even get bloated up with that break_lock then?). Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 5 +++++ arch/ia64/Kconfig | 5 +++++ arch/m32r/Kconfig | 5 +++++ arch/mips/Kconfig | 5 +++++ arch/parisc/Kconfig | 5 +++++ arch/powerpc/Kconfig | 5 +++++ arch/sparc64/Kconfig | 5 +++++ arch/x86/Kconfig | 4 ++++ fs/jbd/checkpoint.c | 3 ++- fs/jbd/commit.c | 2 +- fs/jbd2/checkpoint.c | 3 ++- fs/jbd2/commit.c | 2 +- include/linux/sched.h | 21 +++++++-------------- include/linux/spinlock.h | 6 ++++++ include/linux/spinlock_types.h | 4 ++-- include/linux/spinlock_up.h | 2 ++ kernel/sched.c | 16 ++++++---------- kernel/spinlock.c | 3 +-- mm/memory.c | 8 +++----- 19 files changed, 72 insertions(+), 37 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index de211ac3853..77201d3f747 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -91,6 +91,11 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bef47725d4a..4a81b7fb191 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,6 +42,11 @@ config MMU config SWIOTLB bool +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_XCHGADD_ALGORITHM bool default y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index ab9a264cb19..f7237c5f531 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -235,6 +235,11 @@ config IRAM_SIZE # Define implied options from the CPU selection here # +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool depends on M32R diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6b0f85f02c7..4fad0a34b99 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -694,6 +694,11 @@ source "arch/mips/vr41xx/Kconfig" endmenu +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b8ef1787a19..2b649c46631 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -19,6 +19,11 @@ config MMU config STACK_GROWSUP def_bool y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 232c298c933..c17a194beb0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -53,6 +53,11 @@ config RWSEM_XCHGADD_ALGORITHM bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config ARCH_HAS_ILOG2_U32 bool default y diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 10b212a1f9f..1e25bce0366 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -200,6 +200,11 @@ config US2E_FREQ If in doubt, say N. # Global things across all Sun machines. +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 23936301db5..db434f8171d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,10 @@ config X86_64 config X86 def_bool y +config GENERIC_LOCKBREAK + def_bool y + depends on SMP && PREEMPT + config GENERIC_TIME def_bool y diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 0f69c416eeb..a5432bbbfb8 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -347,7 +347,8 @@ restart: break; } retry = __process_buffer(journal, jh, bhs,&batch_count); - if (!retry && lock_need_resched(&journal->j_list_lock)){ + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); retry = 1; break; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 610264b99a8..31853eb65b4 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -265,7 +265,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1b7f282c1ae..6914598022c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -353,7 +353,8 @@ restart: } retry = __process_buffer(journal, jh, bhs, &batch_count, transaction); - if (!retry && lock_need_resched(&journal->j_list_lock)){ + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); retry = 1; break; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index da8d0eb3b7b..4f302d27927 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -341,7 +341,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 2d0546e884e..9d4797609aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1922,23 +1922,16 @@ extern int cond_resched_softirq(void); /* * Does a critical section need to be broken due to another - * task waiting?: + * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * but a general need for low latency) */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) -# define need_lockbreak(lock) ((lock)->break_lock) -#else -# define need_lockbreak(lock) 0 -#endif - -/* - * Does a critical section need to be broken due to another - * task waiting or preemption being signalled: - */ -static inline int lock_need_resched(spinlock_t *lock) +static inline int spin_needbreak(spinlock_t *lock) { - if (need_lockbreak(lock) || need_resched()) - return 1; +#ifdef CONFIG_PREEMPT + return spin_is_contended(lock); +#else return 0; +#endif } /* diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index c376f3b36c8..124449733c5 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -120,6 +120,12 @@ do { \ #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) +#ifdef CONFIG_GENERIC_LOCKBREAK +#define spin_is_contended(lock) ((lock)->break_lock) +#else +#define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) +#endif + /** * spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index f6a3a951b79..68d88f71f1a 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,7 +19,7 @@ typedef struct { raw_spinlock_t raw_lock; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +#ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK @@ -35,7 +35,7 @@ typedef struct { typedef struct { raw_rwlock_t raw_lock; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +#ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index ea54c4c9a4e..938234c4a99 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -64,6 +64,8 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ +#define __raw_spin_is_contended(lock) (((void)(lock), 0)) + #define __raw_read_can_lock(lock) (((void)(lock), 1)) #define __raw_write_can_lock(lock) (((void)(lock), 1)) diff --git a/kernel/sched.c b/kernel/sched.c index 524285e46fa..ba4c88088f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4945,19 +4945,15 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { + int resched = need_resched() && system_state == SYSTEM_RUNNING; int ret = 0; - if (need_lockbreak(lock)) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); - cpu_relax(); - ret = 1; - spin_lock(lock); - } - if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); ret = 1; spin_lock(lock); } diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd72424c266..ae28c824512 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock); * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ - defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) void __lockfunc _read_lock(rwlock_t *lock) { diff --git a/mm/memory.c b/mm/memory.c index 4b0144b24c1..673ebbf499c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -513,8 +513,7 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(src_ptl) || - need_lockbreak(dst_ptl)) + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -853,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; @@ -1768,8 +1767,7 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || - need_lockbreak(details->i_mmap_lock); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ -- cgit v1.2.3-70-g09d2