From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:09:44 +0800 Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may configure rseq critical section by def Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 86 +++++++++++++++++++------------ criu/cr-restore.c | 63 ++++++++++++++++++++++ criu/include/pstree.h | 1 + images/rseq.proto | 1 + 4 files changed, 119 insertions(+), 32 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index a3f8973..79387fb 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1047,13 +1047,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs) +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, + struct rseq_cs *rseq_cs, struct rseq *rseq) { int ret; - uint64_t addr; /* rseq is not registered */ - if (!rseq->rseq_abi_pointer) + if (!rseqc->rseq_abi_pointer) return 0; /* @@ -1068,22 +1068,21 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). 
*/ - ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), - sizeof(uint64_t)); + ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), + sizeof(struct rseq)); if (ret) { - pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr, - (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t)); + pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq, + (unsigned long)(rseqc->rseq_abi_pointer), sizeof(uint64_t)); return -1; } - /* (struct rseq)->rseq_cs is NULL */ - if (!addr) + if (!rseq->rseq_cs.ptr64) return 0; - ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs)); + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs.ptr64), sizeof(struct rseq_cs)); if (ret) { pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs)); + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs.ptr64, sizeof(struct rseq_cs)); return -1; } @@ -1092,11 +1091,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str static int dump_thread_rseq(struct pstree_item *item, int i) { - struct __ptrace_rseq_configuration rseq; + struct __ptrace_rseq_configuration rseqc; RseqEntry *rseqe = NULL; int ret; CoreEntry *core = item->core[i]; RseqEntry **rseqep = &core->thread_core->rseq_entry; + struct rseq rseq; struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; @@ -1111,20 +1111,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i) if (!kdat.has_ptrace_get_rseq_conf) return 0; - ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); - if (ret != sizeof(rseq)) { + ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc); + if (ret != sizeof(rseqc)) { 
pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); return -1; } - if (rseq.flags != 0) { + if (rseqc.flags != 0) { pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, - rseq.flags); + rseqc.flags); return -1; } - pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, - rseq.signature); + pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer, + rseqc.signature); rseqe = xmalloc(sizeof(*rseqe)); if (!rseqe) @@ -1132,13 +1132,22 @@ static int dump_thread_rseq(struct pstree_item *item, int i) rseq_entry__init(rseqe); - rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; - rseqe->rseq_abi_size = rseq.rseq_abi_size; - rseqe->signature = rseq.signature; + rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer; + rseqe->rseq_abi_size = rseqc.rseq_abi_size; + rseqe->signature = rseqc.signature; - if (read_rseq_cs(tid, &rseq, rseq_cs)) + if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq)) goto err; + rseqe->has_rseq_cs_pointer = true; + rseqe->rseq_cs_pointer = rseq.rseq_cs.ptr64; + pr_err("cs pointer %lx\n", rseqe->rseq_cs_pointer); + /* we won't save rseq_cs to the image (only pointer), + * so let's combine flags from both struct rseq and struct rseq_cs + * (kernel does the same when interpreting RSEQ_CS_FLAG_*) + */ + rseq_cs->flags |= rseq.flags; + /* save rseq entry to the image */ *rseqep = rseqe; @@ -1188,11 +1197,11 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; - /* (struct rseq)->rseq_cs is NULL */ + /* equivalent to (struct rseq)->rseq_cs is NULL */ if (!rseq_cs->start_ip) return 0; - pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + pr_debug("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx 
flags = %x version = %x; IP = %lx\n", tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, rseq_cs->version, (unsigned long)TI_IP(core)); @@ -1204,25 +1213,38 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) if (task_in_rseq(rseq_cs, TI_IP(core))) { struct pid *tid = &item->threads[i]; - pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", - tid->real); - /* * We need to fixup task instruction pointer from * the original one (which lays inside rseq critical section) - * to rseq abort handler address. + * to rseq abort handler address. But we need to look at rseq_cs->flags + * (please refer to struct rseq -> flags field description). + * Naive idea of flags support may be like... let's change instruction pointer (IP) + * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). + * But unfortunately, it doesn't work properly, because the kernel does + * clean up of rseq_cs field in the struct rseq (modifies userspace memory). + * So, we need to preserve original value of (struct rseq)->rseq_cs field in the + * image and restore its value before releasing threads. * * It's worth to mention that we need to fixup IP in CoreEntry * (used when full dump/restore is performed) and also in * the parasite regs storage (used if --leave-running option is used, * or if dump error occured and process execution is resumed). */ - TI_IP(core) = rseq_cs->abort_ip; - if (item->pid->real == tid->real) { - compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); + if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) { + pr_err("The %d task is in rseq critical section.!!! IP will be set to rseq abort handler addr\n", + tid->real); } else { - compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); + pr_warn("The %d task is in rseq critical section. 
IP will be set to rseq abort handler addr\n", + tid->real); + + TI_IP(core) = rseq_cs->abort_ip; + + if (item->pid->real == tid->real) { + compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); + } else { + compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); + } } } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b2bd044..864140f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -23,6 +23,7 @@ #include "common/compiler.h" #include "linux/mount.h" +#include "linux/rseq.h" #include "clone-noasan.h" #include "cr_options.h" @@ -779,6 +780,7 @@ static int open_cores(int pid, CoreEntry *leader_core) { int i, tpid; CoreEntry **cores = NULL; + //RseqEntry *rseqs; cores = xmalloc(sizeof(*cores) * current->nr_threads); if (!cores) @@ -812,6 +814,19 @@ static int open_cores(int pid, CoreEntry *leader_core) } } + + pr_err("item %lx\n", (uint64_t)current); + + for (i = 0; i < current->nr_threads; i++) { + ThreadCoreEntry *tc = cores[i]->thread_core; + + /* compatibility with older CRIU versions */ + if (!tc->rseq_entry) + continue; + + current->rseqe[i] = *tc->rseq_entry; + } + return 0; err: xfree(cores); @@ -868,8 +883,15 @@ static int restore_one_alive_task(int pid, CoreEntry *core) { unsigned args_len; struct task_restore_args *ta; + RseqEntry *rseqs; pr_info("Restoring resources\n"); + rseqs = shmalloc(sizeof(*rseqs) * current->nr_threads); + if (!rseqs) + return -1; + + current->rseqe = rseqs; + rst_mem_switch_to_private(); args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); @@ -1966,6 +1988,44 @@ static int attach_to_tasks(bool root_seized) return 0; } +static int restore_rseq_cs(void) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + int i; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) + return -1; + } + + 
for (i = 0; i < item->nr_threads; i++) { + pid_t pid = item->threads[i].real; + + if (!item->rseqe[i].rseq_cs_pointer || !item->rseqe[i].rseq_abi_pointer) { + pr_err("item %lx rseqe %lx\n", (uint64_t)item, (uint64_t)item->rseqe); + pr_err("nothing to do with cs_pointer\n"); + continue; + } + + pr_err("restoring cs ... %lx \n", item->rseqe[i].rseq_cs_pointer); + + if (ptrace_poke_area(pid, &item->rseqe[i].rseq_cs_pointer, (void *)(item->rseqe[i].rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t))) { + pr_err("Can't restore memfd args (pid: %d)\n", pid); + return -1; + } + } + } + + return 0; +} + static int catch_tasks(bool root_seized, enum trace_flags *flag) { struct pstree_item *item; @@ -2400,6 +2460,9 @@ skip_ns_bouncing: if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); + /* just before releasing threads we have to restore rseq_cs */ + restore_rseq_cs(); + /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 458e5f9..97bef11 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -25,6 +25,7 @@ struct pstree_item { int nr_threads; /* number of threads */ struct pid *threads; /* array of threads */ CoreEntry **core; + RseqEntry *rseqe; TaskKobjIdsEntry *ids; union { futex_t task_st; diff --git a/images/rseq.proto b/images/rseq.proto index be28004..45cb847 100644 --- a/images/rseq.proto +++ b/images/rseq.proto @@ -6,4 +6,5 @@ message rseq_entry { required uint64 rseq_abi_pointer = 1; required uint32 rseq_abi_size = 2; required uint32 signature = 3; + optional uint64 rseq_cs_pointer = 4; } -- 2.30.0