91 lines
2.7 KiB
Diff
91 lines
2.7 KiB
Diff
From 79ba1e15d80eba3aff4396f44629eb8960722d36 Mon Sep 17 00:00:00 2001
|
|
From: Srikanth C S <srikanth.c.s@oracle.com>
|
|
Date: Tue, 13 Dec 2022 22:45:43 +0530
|
|
Subject: [PATCH] fsck.xfs: mount/umount xfs fs to replay log before running
|
|
xfs_repair
|
|
|
|
After a recent data center crash, we had to recover root filesystems
|
|
on several thousands of VMs via a boot time fsck. Since these
|
|
machines are remotely manageable, support can inject the kernel
|
|
command line with 'fsck.mode=force fsck.repair=yes' to kick off
|
|
xfs_repair if the machine won't come up or if they suspect there
|
|
might be deeper issues with latent errors in the fs metadata, which
|
|
is what they did to try to get everyone running ASAP while
|
|
anticipating any future problems. But, fsck.xfs does not address the
|
|
journal replay in case of a crash.
|
|
|
|
fsck.xfs does xfs_repair -e if fsck.mode=force is set. It is
|
|
possible that when the machine crashes, the fs is in an inconsistent
|
|
state with the journal log not yet replayed. This can drop the machine
|
|
into the rescue shell because xfs_fsck.sh does not know how to clean the
|
|
log. Since the administrator told us to force repairs, address the
|
|
deficiency by cleaning the log and rerunning xfs_repair.
|
|
|
|
Run xfs_repair -e when fsck.mode=force and fsck.repair=auto or yes.
|
|
Replay the logs only if fsck.mode=force and fsck.repair=yes. For
|
|
other options (-fa and -f), drop to the rescue shell if repair detects
|
|
any corruptions.
|
|
|
|
Signed-off-by: Srikanth C S <srikanth.c.s@oracle.com>
|
|
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
|
|
Signed-off-by: Carlos Maiolino <cem@kernel.org>
|
|
---
|
|
fsck/xfs_fsck.sh | 31 +++++++++++++++++++++++++++++--
|
|
1 file changed, 29 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fsck/xfs_fsck.sh b/fsck/xfs_fsck.sh
|
|
index 6af0f22..62a1e0b 100755
|
|
--- a/fsck/xfs_fsck.sh
|
|
+++ b/fsck/xfs_fsck.sh
|
|
@@ -31,10 +31,12 @@ repair2fsck_code() {
|
|
|
|
AUTO=false
|
|
FORCE=false
|
|
+REPAIR=false
|
|
while getopts ":aApyf" c
|
|
do
|
|
case $c in
|
|
- a|A|p|y) AUTO=true;;
|
|
+ a|A|p) AUTO=true;;
|
|
+ y) REPAIR=true;;
|
|
f) FORCE=true;;
|
|
esac
|
|
done
|
|
@@ -64,7 +66,32 @@ fi
|
|
|
|
if $FORCE; then
|
|
xfs_repair -e $DEV
|
|
- repair2fsck_code $?
|
|
+ error=$?
|
|
+ if [ $error -eq 2 ] && [ $REPAIR = true ]; then
|
|
+ echo "Replaying log for $DEV"
|
|
+ mkdir -p /tmp/repair_mnt || exit 1
|
|
+ for x in $(cat /proc/cmdline); do
|
|
+ case $x in
|
|
+ root=*)
|
|
+ ROOT="${x#root=}"
|
|
+ ;;
|
|
+ rootflags=*)
|
|
+ ROOTFLAGS="-o ${x#rootflags=}"
|
|
+ ;;
|
|
+ esac
|
|
+ done
|
|
+ test -b "$ROOT" || ROOT=$(blkid -t "$ROOT" -o device)
|
|
+ if [ $(basename $DEV) = $(basename $ROOT) ]; then
|
|
+ mount $DEV /tmp/repair_mnt $ROOTFLAGS || exit 1
|
|
+ else
|
|
+ mount $DEV /tmp/repair_mnt || exit 1
|
|
+ fi
|
|
+ umount /tmp/repair_mnt
|
|
+ xfs_repair -e $DEV
|
|
+ error=$?
|
|
+ rm -d /tmp/repair_mnt
|
|
+ fi
|
|
+ repair2fsck_code $error
|
|
exit $?
|
|
fi
|
|
|
|
--
|
|
1.8.3.1
|
|
|