From e0c2a9aa1e68455dc3439e95d85cabcaff073666 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 17:18:05 -0500 Subject: GFS2: dlm based recovery coordination This new method of managing recovery is an alternative to the previous approach of using the userland gfs_controld. - use dlm slot numbers to assign journal id's - use dlm recovery callbacks to initiate journal recovery - use a dlm lock to determine the first node to mount fs - use a dlm lock to track journals that need recovery Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) (limited to 'fs/gfs2/incore.h') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e1d3bb59945..b9422bc8e2f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -139,8 +139,45 @@ struct gfs2_bufdata { #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + enum { DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, }; struct lm_lockname { @@ -499,14 +536,26 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_first_done; unsigned int ls_nodir; const struct lm_lockops *ls_ops; - unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid_done; - int ls_recover_jid_status; + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_sbd { @@ -544,6 +593,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct delayed_work sd_control_work; /* Inode Stuff */ -- cgit v1.2.3-70-g09d2 From e8ca5cc571a60339491f8c273a01093096ff8704 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 14:40:06 -0500 Subject: GFS2: let spectator mount do read only recovery Previously, a spectator mount would not even attempt to do journal recovery for a failed node. This meant that if all mounted nodes were spectators, everyone would be stuck after a node failed, all waiting for recovery to be performed. This is unnecessary since the failed node had a clean journal. Instead, allow a spectator mount to do a partial "read only" recovery, which means it will check if the failed journal is clean, and if so, report a successful recovery. If the failed journal is not clean, it reports that journal recovery failed. This makes it work the same as a read only mount on a read only block device. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/recovery.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/gfs2/incore.h') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b9422bc8e2f..e5701c70f6f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -498,6 +498,7 @@ enum { SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b01573b7ad9..6aacf3f230a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1078,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; - set_bit(SDF_NORECOVERY, &sdp->sd_flags); + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index af49e8f432f..80701d1566a 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -516,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { -- cgit v1.2.3-70-g09d2 From 376d37788b56bc2800e5bd56b7a36b3544d89f97 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Jan 2012 15:29:20 -0500 Subject: GFS2: fail mount if journal recovery fails If the first mounter fails to recover one of the journals during mount, the mount should fail. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/recovery.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/gfs2/incore.h') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e5701c70f6f..97742a7ea9c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -429,6 +429,7 @@ struct gfs2_jdesc { #define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; + int jd_recover_error; }; struct gfs2_statfs_change_host { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 80701d1566a..963b2d75200 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -583,6 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: + jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); @@ -611,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); - return 0; + return wait ? jd->jd_recover_error : 0; } -- cgit v1.2.3-70-g09d2