see https://github.com/checkpoint-restore/criu/issues/1696 and https://github.com/checkpoint-restore/criu/pull/1706
249 lines
7.3 KiB
Diff
249 lines
7.3 KiB
Diff
From 33abfc12b973560b3d98afdbac7554b8c0542c3d Mon Sep 17 00:00:00 2001
|
|
From: bb-cat <ningyu9@huawei.com>
|
|
Date: Wed, 2 Mar 2022 15:04:54 +0800
|
|
Subject: [PATCH 13/16] cr-dump: fixup thread IP when inside rseq cs
|
|
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
|
|
|
|
---
|
|
criu/cr-dump.c | 155 +++++++++++++++++++++++++++-
|
|
criu/include/parasite.h | 2 +
|
|
criu/include/pstree.h | 1 +
|
|
3 files changed, 154 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
|
|
index 91dd08a..a3f8973 100644
|
|
--- a/criu/cr-dump.c
|
|
+++ b/criu/cr-dump.c
|
|
@@ -1047,11 +1047,58 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
|
|
return 0;
|
|
}
|
|
|
|
-static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
|
|
+static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs)
|
|
+{
|
|
+ int ret;
|
|
+ uint64_t addr;
|
|
+
|
|
+ /* rseq is not registered */
|
|
+ if (!rseq->rseq_abi_pointer)
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * We need to cover the case when victim process was inside rseq critical section
|
|
+ * at the moment when CRIU came and seized it. We need to determine the borders
|
|
+ * of rseq critical section at first. To achieve that we need to access thread
|
|
+ * memory and read pointer to struct rseq_cs.
|
|
+ *
|
|
+ * We have two ways to access thread memory: from the parasite and using ptrace().
|
|
+ * But in this case we can't use parasite, because if victim process returns to the
|
|
+ * execution, on the kernel side __rseq_handle_notify_resume hook will be called,
|
|
+ * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
|
|
+ * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
|
|
+ */
|
|
+ ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)),
|
|
+ sizeof(uint64_t));
|
|
+ if (ret) {
|
|
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr,
|
|
+ (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t));
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* (struct rseq)->rseq_cs is NULL */
|
|
+ if (!addr)
|
|
+ return 0;
|
|
+
|
|
+ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs));
|
|
+ if (ret) {
|
|
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
|
|
+ (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs));
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int dump_thread_rseq(struct pstree_item *item, int i)
|
|
{
|
|
struct __ptrace_rseq_configuration rseq;
|
|
RseqEntry *rseqe = NULL;
|
|
int ret;
|
|
+ CoreEntry *core = item->core[i];
|
|
+ RseqEntry **rseqep = &core->thread_core->rseq_entry;
|
|
+ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
|
|
+ pid_t tid = item->threads[i].real;
|
|
|
|
/*
|
|
* If we are here it means that rseq() syscall is supported,
|
|
@@ -1076,7 +1123,8 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
|
|
return -1;
|
|
}
|
|
|
|
- pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature);
|
|
+ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
|
|
+ rseq.signature);
|
|
|
|
rseqe = xmalloc(sizeof(*rseqe));
|
|
if (!rseqe)
|
|
@@ -1088,25 +1136,118 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
|
|
rseqe->rseq_abi_size = rseq.rseq_abi_size;
|
|
rseqe->signature = rseq.signature;
|
|
|
|
+ if (read_rseq_cs(tid, &rseq, rseq_cs))
|
|
+ goto err;
|
|
+
|
|
+ /* save rseq entry to the image */
|
|
*rseqep = rseqe;
|
|
|
|
return 0;
|
|
+
|
|
+err:
|
|
+ xfree(rseqe);
|
|
+ return -1;
|
|
}
|
|
|
|
static int dump_task_rseq(pid_t pid, struct pstree_item *item)
|
|
{
|
|
int i;
|
|
+ struct rseq_cs *thread_rseq_cs;
|
|
|
|
/* if rseq() syscall isn't supported then nothing to dump */
|
|
if (!kdat.has_rseq)
|
|
return 0;
|
|
|
|
+ thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads);
|
|
+ if (!thread_rseq_cs)
|
|
+ return -1;
|
|
+
|
|
+ dmpi(item)->thread_rseq_cs = thread_rseq_cs;
|
|
+
|
|
for (i = 0; i < item->nr_threads; i++) {
|
|
- if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry))
|
|
- return -1;
|
|
+ if (dump_thread_rseq(item, i))
|
|
+ goto free_rseq;
|
|
}
|
|
|
|
return 0;
|
|
+
|
|
+free_rseq:
|
|
+ xfree(thread_rseq_cs);
|
|
+ dmpi(item)->thread_rseq_cs = NULL;
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr)
|
|
+{
|
|
+ return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset;
|
|
+}
|
|
+
|
|
+static int fixup_thread_rseq(struct pstree_item *item, int i)
|
|
+{
|
|
+ CoreEntry *core = item->core[i];
|
|
+ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
|
|
+ pid_t tid = item->threads[i].real;
|
|
+
|
|
+ /* (struct rseq)->rseq_cs is NULL */
|
|
+ if (!rseq_cs->start_ip)
|
|
+ return 0;
|
|
+
|
|
+ pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
|
|
+ tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
|
|
+ rseq_cs->version, (unsigned long)TI_IP(core));
|
|
+
|
|
+ if (rseq_cs->version != 0) {
|
|
+ pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ if (task_in_rseq(rseq_cs, TI_IP(core))) {
|
|
+ struct pid *tid = &item->threads[i];
|
|
+
|
|
+ pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
|
|
+ tid->real);
|
|
+
|
|
+ /*
|
|
+ * We need to fixup task instruction pointer from
|
|
+ * the original one (which lies inside rseq critical section)
|
|
+ * to rseq abort handler address.
|
|
+ *
|
|
+ * It's worth mentioning that we need to fixup IP in CoreEntry
|
|
+ * (used when full dump/restore is performed) and also in
|
|
+ * the parasite regs storage (used if --leave-running option is used,
|
|
+ * or if dump error occurred and process execution is resumed).
|
|
+ */
|
|
+ TI_IP(core) = rseq_cs->abort_ip;
|
|
+
|
|
+ if (item->pid->real == tid->real) {
|
|
+ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
|
|
+ } else {
|
|
+ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int fixup_task_rseq(pid_t pid, struct pstree_item *item)
|
|
+{
|
|
+ int ret = 0;
|
|
+ int i;
|
|
+
|
|
+ if (!kdat.has_ptrace_get_rseq_conf)
|
|
+ return 0;
|
|
+
|
|
+ for (i = 0; i < item->nr_threads; i++) {
|
|
+ if (fixup_thread_rseq(item, i)) {
|
|
+ ret = -1;
|
|
+ goto exit;
|
|
+ }
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ xfree(dmpi(item)->thread_rseq_cs);
|
|
+ dmpi(item)->thread_rseq_cs = NULL;
|
|
+ return ret;
|
|
}
|
|
|
|
static struct proc_pid_stat pps_buf;
|
|
@@ -1409,6 +1550,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
|
|
goto err;
|
|
}
|
|
|
|
+ ret = fixup_task_rseq(pid, item);
|
|
+ if (ret) {
|
|
+ pr_err("Fixup rseq for %d failed %d\n", pid, ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
if (fault_injected(FI_DUMP_EARLY)) {
|
|
pr_info("fault: CRIU sudden detach\n");
|
|
kill(getpid(), SIGKILL);
|
|
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
|
|
index 5fde809..d2a0688 100644
|
|
--- a/criu/include/parasite.h
|
|
+++ b/criu/include/parasite.h
|
|
@@ -10,6 +10,8 @@
|
|
#include <time.h>
|
|
#include <signal.h>
|
|
|
|
+#include "linux/rseq.h"
|
|
+
|
|
#include "image.h"
|
|
#include "util-pie.h"
|
|
#include "common/lock.h"
|
|
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
|
|
index c5b0fa7..458e5f9 100644
|
|
--- a/criu/include/pstree.h
|
|
+++ b/criu/include/pstree.h
|
|
@@ -63,6 +63,7 @@ struct dmp_info {
|
|
struct parasite_ctl *parasite_ctl;
|
|
struct parasite_thread_ctl **thread_ctls;
|
|
uint64_t *thread_sp;
|
|
+ struct rseq_cs *thread_rseq_cs;
|
|
|
|
/*
|
|
* Although we don't support dumping different struct creds in general,
|
|
--
|
|
2.30.0
|
|
|