See CRIU issue https://github.com/checkpoint-restore/criu/issues/1696 and pull request https://github.com/checkpoint-restore/criu/pull/1706.
331 lines
11 KiB
Diff
331 lines
11 KiB
Diff
From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001
|
|
From: bb-cat <ningyu9@huawei.com>
|
|
Date: Wed, 2 Mar 2022 15:09:44 +0800
|
|
Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may
|
|
configure rseq critical section by def
|
|
|
|
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
|
|
---
|
|
criu/cr-dump.c | 86 +++++++++++++++++++------------
|
|
criu/cr-restore.c | 63 ++++++++++++++++++++++
|
|
criu/include/pstree.h | 1 +
|
|
images/rseq.proto | 1 +
|
|
4 files changed, 119 insertions(+), 32 deletions(-)
|
|
|
|
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
|
|
index a3f8973..79387fb 100644
|
|
--- a/criu/cr-dump.c
|
|
+++ b/criu/cr-dump.c
|
|
@@ -1047,13 +1047,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
|
|
return 0;
|
|
}
|
|
|
|
-static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs)
|
|
+static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc,
|
|
+ struct rseq_cs *rseq_cs, struct rseq *rseq)
|
|
{
|
|
int ret;
|
|
- uint64_t addr;
|
|
|
|
/* rseq is not registered */
|
|
- if (!rseq->rseq_abi_pointer)
|
|
+ if (!rseqc->rseq_abi_pointer)
|
|
return 0;
|
|
|
|
/*
|
|
@@ -1068,22 +1068,21 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
|
|
* then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
|
|
* will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
|
|
*/
|
|
- ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)),
|
|
- sizeof(uint64_t));
|
|
+ ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer),
|
|
+ sizeof(struct rseq));
|
|
if (ret) {
|
|
- pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr,
|
|
- (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t));
|
|
+ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq,
|
|
+ (unsigned long)(rseqc->rseq_abi_pointer), sizeof(uint64_t));
|
|
return -1;
|
|
}
|
|
|
|
- /* (struct rseq)->rseq_cs is NULL */
|
|
- if (!addr)
|
|
+ if (!rseq->rseq_cs.ptr64)
|
|
return 0;
|
|
|
|
- ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs));
|
|
+ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs.ptr64), sizeof(struct rseq_cs));
|
|
if (ret) {
|
|
pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
|
|
- (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs));
|
|
+ (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs.ptr64, sizeof(struct rseq_cs));
|
|
return -1;
|
|
}
|
|
|
|
@@ -1092,11 +1091,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
|
|
|
|
static int dump_thread_rseq(struct pstree_item *item, int i)
|
|
{
|
|
- struct __ptrace_rseq_configuration rseq;
|
|
+ struct __ptrace_rseq_configuration rseqc;
|
|
RseqEntry *rseqe = NULL;
|
|
int ret;
|
|
CoreEntry *core = item->core[i];
|
|
RseqEntry **rseqep = &core->thread_core->rseq_entry;
|
|
+ struct rseq rseq;
|
|
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
|
|
pid_t tid = item->threads[i].real;
|
|
|
|
@@ -1111,20 +1111,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
|
|
if (!kdat.has_ptrace_get_rseq_conf)
|
|
return 0;
|
|
|
|
- ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
|
|
- if (ret != sizeof(rseq)) {
|
|
+ ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc);
|
|
+ if (ret != sizeof(rseqc)) {
|
|
pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
|
|
return -1;
|
|
}
|
|
|
|
- if (rseq.flags != 0) {
|
|
+ if (rseqc.flags != 0) {
|
|
pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
|
|
- rseq.flags);
|
|
+ rseqc.flags);
|
|
return -1;
|
|
}
|
|
|
|
- pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
|
|
- rseq.signature);
|
|
+ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer,
|
|
+ rseqc.signature);
|
|
|
|
rseqe = xmalloc(sizeof(*rseqe));
|
|
if (!rseqe)
|
|
@@ -1132,13 +1132,22 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
|
|
|
|
rseq_entry__init(rseqe);
|
|
|
|
- rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
|
|
- rseqe->rseq_abi_size = rseq.rseq_abi_size;
|
|
- rseqe->signature = rseq.signature;
|
|
+ rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer;
|
|
+ rseqe->rseq_abi_size = rseqc.rseq_abi_size;
|
|
+ rseqe->signature = rseqc.signature;
|
|
|
|
- if (read_rseq_cs(tid, &rseq, rseq_cs))
|
|
+ if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq))
|
|
goto err;
|
|
|
|
+ rseqe->has_rseq_cs_pointer = true;
|
|
+ rseqe->rseq_cs_pointer = rseq.rseq_cs.ptr64;
|
|
+ pr_err("cs pointer %lx\n", rseqe->rseq_cs_pointer);
|
|
+ /* we won't save rseq_cs to the image (only pointer),
|
|
+ * so let's combine flags from both struct rseq and struct rseq_cs
|
|
+ * (kernel does the same when interpreting RSEQ_CS_FLAG_*)
|
|
+ */
|
|
+ rseq_cs->flags |= rseq.flags;
|
|
+
|
|
/* save rseq entry to the image */
|
|
*rseqep = rseqe;
|
|
|
|
@@ -1188,11 +1197,11 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
|
|
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
|
|
pid_t tid = item->threads[i].real;
|
|
|
|
- /* (struct rseq)->rseq_cs is NULL */
|
|
+ /* equivalent to (struct rseq)->rseq_cs is NULL */
|
|
if (!rseq_cs->start_ip)
|
|
return 0;
|
|
|
|
- pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
|
|
+ pr_debug("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
|
|
tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
|
|
rseq_cs->version, (unsigned long)TI_IP(core));
|
|
|
|
@@ -1204,25 +1213,38 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
|
|
if (task_in_rseq(rseq_cs, TI_IP(core))) {
|
|
struct pid *tid = &item->threads[i];
|
|
|
|
- pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
|
|
- tid->real);
|
|
-
|
|
/*
|
|
* We need to fixup task instruction pointer from
|
|
* the original one (which lays inside rseq critical section)
|
|
- * to rseq abort handler address.
|
|
+ * to rseq abort handler address. But we need to look at rseq_cs->flags
|
|
+ * (please refer to struct rseq -> flags field description).
|
|
+ * A naive way to support the flags would be to change the instruction pointer (IP)
|
|
+ * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL).
|
|
+ * But unfortunately, it doesn't work properly, because the kernel does
|
|
+ * clean up of rseq_cs field in the struct rseq (modifies userspace memory).
|
|
+ * So, we need to preserve original value of (struct rseq)->rseq_cs field in the
|
|
+ * image and restore its value before releasing threads.
|
|
*
|
|
* It's worth to mention that we need to fixup IP in CoreEntry
|
|
* (used when full dump/restore is performed) and also in
|
|
* the parasite regs storage (used if --leave-running option is used,
|
|
* or if dump error occured and process execution is resumed).
|
|
*/
|
|
- TI_IP(core) = rseq_cs->abort_ip;
|
|
|
|
- if (item->pid->real == tid->real) {
|
|
- compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
|
|
+ if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) {
|
|
+ pr_err("The %d task is in rseq critical section.!!! IP will be set to rseq abort handler addr\n",
|
|
+ tid->real);
|
|
} else {
|
|
- compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
|
|
+ pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
|
|
+ tid->real);
|
|
+
|
|
+ TI_IP(core) = rseq_cs->abort_ip;
|
|
+
|
|
+ if (item->pid->real == tid->real) {
|
|
+ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
|
|
+ } else {
|
|
+ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
|
|
+ }
|
|
}
|
|
}
|
|
|
|
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
|
|
index b2bd044..864140f 100644
|
|
--- a/criu/cr-restore.c
|
|
+++ b/criu/cr-restore.c
|
|
@@ -23,6 +23,7 @@
|
|
#include "common/compiler.h"
|
|
|
|
#include "linux/mount.h"
|
|
+#include "linux/rseq.h"
|
|
|
|
#include "clone-noasan.h"
|
|
#include "cr_options.h"
|
|
@@ -779,6 +780,7 @@ static int open_cores(int pid, CoreEntry *leader_core)
|
|
{
|
|
int i, tpid;
|
|
CoreEntry **cores = NULL;
|
|
+ //RseqEntry *rseqs;
|
|
|
|
cores = xmalloc(sizeof(*cores) * current->nr_threads);
|
|
if (!cores)
|
|
@@ -812,6 +814,19 @@ static int open_cores(int pid, CoreEntry *leader_core)
|
|
}
|
|
}
|
|
|
|
+
|
|
+ pr_err("item %lx\n", (uint64_t)current);
|
|
+
|
|
+ for (i = 0; i < current->nr_threads; i++) {
|
|
+ ThreadCoreEntry *tc = cores[i]->thread_core;
|
|
+
|
|
+ /* compatibility with older CRIU versions */
|
|
+ if (!tc->rseq_entry)
|
|
+ continue;
|
|
+
|
|
+ current->rseqe[i] = *tc->rseq_entry;
|
|
+ }
|
|
+
|
|
return 0;
|
|
err:
|
|
xfree(cores);
|
|
@@ -868,8 +883,15 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
|
|
{
|
|
unsigned args_len;
|
|
struct task_restore_args *ta;
|
|
+ RseqEntry *rseqs;
|
|
pr_info("Restoring resources\n");
|
|
|
|
+ rseqs = shmalloc(sizeof(*rseqs) * current->nr_threads);
|
|
+ if (!rseqs)
|
|
+ return -1;
|
|
+
|
|
+ current->rseqe = rseqs;
|
|
+
|
|
rst_mem_switch_to_private();
|
|
|
|
args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size());
|
|
@@ -1966,6 +1988,44 @@ static int attach_to_tasks(bool root_seized)
|
|
return 0;
|
|
}
|
|
|
|
+static int restore_rseq_cs(void)
|
|
+{
|
|
+ struct pstree_item *item;
|
|
+
|
|
+ for_each_pstree_item(item) {
|
|
+ int i;
|
|
+
|
|
+ if (!task_alive(item))
|
|
+ continue;
|
|
+
|
|
+ if (item->nr_threads == 1) {
|
|
+ item->threads[0].real = item->pid->real;
|
|
+ } else {
|
|
+ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads))
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < item->nr_threads; i++) {
|
|
+ pid_t pid = item->threads[i].real;
|
|
+
|
|
+ if (!item->rseqe[i].rseq_cs_pointer || !item->rseqe[i].rseq_abi_pointer) {
|
|
+ pr_err("item %lx rseqe %lx\n", (uint64_t)item, (uint64_t)item->rseqe);
|
|
+ pr_err("nothing to do with cs_pointer\n");
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ pr_err("restoring cs ... %lx \n", item->rseqe[i].rseq_cs_pointer);
|
|
+
|
|
+ if (ptrace_poke_area(pid, &item->rseqe[i].rseq_cs_pointer, (void *)(item->rseqe[i].rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t))) {
|
|
+ pr_err("Can't restore memfd args (pid: %d)\n", pid);
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
static int catch_tasks(bool root_seized, enum trace_flags *flag)
|
|
{
|
|
struct pstree_item *item;
|
|
@@ -2400,6 +2460,9 @@ skip_ns_bouncing:
|
|
if (restore_freezer_state())
|
|
pr_err("Unable to restore freezer state\n");
|
|
|
|
+ /* just before releasing threads we have to restore rseq_cs */
|
|
+ restore_rseq_cs();
|
|
+
|
|
/* Detaches from processes and they continue run through sigreturn. */
|
|
if (finalize_restore_detach())
|
|
goto out_kill_network_unlocked;
|
|
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
|
|
index 458e5f9..97bef11 100644
|
|
--- a/criu/include/pstree.h
|
|
+++ b/criu/include/pstree.h
|
|
@@ -25,6 +25,7 @@ struct pstree_item {
|
|
int nr_threads; /* number of threads */
|
|
struct pid *threads; /* array of threads */
|
|
CoreEntry **core;
|
|
+ RseqEntry *rseqe;
|
|
TaskKobjIdsEntry *ids;
|
|
union {
|
|
futex_t task_st;
|
|
diff --git a/images/rseq.proto b/images/rseq.proto
|
|
index be28004..45cb847 100644
|
|
--- a/images/rseq.proto
|
|
+++ b/images/rseq.proto
|
|
@@ -6,4 +6,5 @@ message rseq_entry {
|
|
required uint64 rseq_abi_pointer = 1;
|
|
required uint32 rseq_abi_size = 2;
|
|
required uint32 signature = 3;
|
|
+ optional uint64 rseq_cs_pointer = 4;
|
|
}
|
|
--
|
|
2.30.0
|
|
|