upstream backport: - 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch * commit id: 2cb1156 feature: - 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch * support checkpoint/restore cpu affinity - 0006-criu-add-pin-memory-method.patch * support kernel feature pin memory option Signed-off-by: fu.lin <fulin10@huawei.com>
269 lines
7.6 KiB
Diff
269 lines
7.6 KiB
Diff
From 4c11832330e6c7b924b96c7ea70c14025fe0d970 Mon Sep 17 00:00:00 2001
|
|
From: "fu.lin" <fulin10@huawei.com>
|
|
Date: Tue, 13 Apr 2021 14:10:23 +0800
|
|
Subject: [PATCH 6/6] criu: add pin memory method
|
|
|
|
We can use the checkpoint and restore in userspace method to dump
and restore tasks when updating the kernel. Currently, criu needs
to dump all memory data of tasks to files. When the memory size is
very large (larger than 1GiB), the time taken to dump the data
will be very long (more than 1 min).
|
|
|
|
We can pin the memory data of tasks and collect the corresponding
physical page mapping info in the checkpoint process, and remap the
physical pages to the restored tasks in the restore process.
|
|
|
|
Signed-off-by: Jingxian He <hejingxian@huawei.com>
|
|
---
|
|
criu/config.c | 1 +
|
|
criu/cr-restore.c | 5 +++
|
|
criu/include/cr_options.h | 1 +
|
|
criu/include/restorer.h | 24 ++++++++++++
|
|
criu/mem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++-
|
|
criu/pie/restorer.c | 21 ++++++++++-
|
|
6 files changed, 146 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/criu/config.c b/criu/config.c
|
|
index 5a53256..61b81fa 100644
|
|
--- a/criu/config.c
|
|
+++ b/criu/config.c
|
|
@@ -542,6 +542,7 @@ int parse_options(int argc, char **argv, bool *usage_error,
|
|
{ "pre-dump-mode", required_argument, 0, 1097},
|
|
{ "file-validation", required_argument, 0, 1098 },
|
|
BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity),
|
|
+ BOOL_OPT("pin-memory", &opts.pin_memory),
|
|
{ },
|
|
};
|
|
|
|
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
|
|
index da2e53d..ff41976 100644
|
|
--- a/criu/cr-restore.c
|
|
+++ b/criu/cr-restore.c
|
|
@@ -3866,6 +3866,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
|
|
task_args->clone_restore_fn,
|
|
task_args->thread_args);
|
|
|
|
+ if (opts.pin_memory)
|
|
+ task_args->pin_memory = true;
|
|
+ else
|
|
+ task_args->pin_memory = false;
|
|
+
|
|
/*
|
|
* An indirect call to task_restore, note it never returns
|
|
* and restoring core is extremely destructive.
|
|
diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
|
|
index fda54a4..a4dc5b8 100644
|
|
--- a/criu/include/cr_options.h
|
|
+++ b/criu/include/cr_options.h
|
|
@@ -176,6 +176,7 @@ struct cr_options {
|
|
int file_validation_method;
|
|
/* restore cpu affinity */
|
|
int with_cpu_affinity;
|
|
+ int pin_memory;
|
|
};
|
|
|
|
extern struct cr_options opts;
|
|
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
|
|
index bd6ef6a..fc37e6d 100644
|
|
--- a/criu/include/restorer.h
|
|
+++ b/criu/include/restorer.h
|
|
@@ -225,6 +225,7 @@ struct task_restore_args {
|
|
int lsm_type;
|
|
int child_subreaper;
|
|
bool has_clone3_set_tid;
|
|
+ bool pin_memory;
|
|
} __aligned(64);
|
|
|
|
/*
|
|
@@ -317,4 +318,27 @@ enum {
|
|
#define __r_sym(name) restorer_sym ## name
|
|
#define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name))
|
|
|
|
+/*
+ * Pin-memory interface: requests are issued as ioctls on PIN_MEM_FILE.
+ */
+#define PIN_MEM_FILE		"/dev/pinmem"
+#define PIN_MEM_MAGIC		0x59
+
+/* ioctl command numbers within the PIN_MEM_MAGIC namespace */
+#define _SET_PIN_MEM_AREA	1
+#define _CLEAR_PIN_MEM_AREA	2
+#define _REMAP_PIN_MEM_AREA	3
+#define _PIN_MEM_IOC_MAX_NR	4
+
+/* Argument: pointer to a struct pin_mem_area_set describing the areas to pin. */
+#define SET_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+/* Argument: pointer to an int (presumably a pid, as for the remap request -- TODO confirm). */
+#define CLEAR_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+/* Argument: pointer to an int holding the pid whose areas are remapped. */
+#define REMAP_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+
+/*
+ * Largest byte length described by a single pin_mem_area entry.
+ * Parenthesized so the value stays intact next to any operator
+ * (e.g. "x / ONCE_PIN_MEM_SIZE_LIMIT").
+ */
+#define ONCE_PIN_MEM_SIZE_LIMIT	(32 * 1024 * 1024)
+/* Number of pin_mem_area slots carried by one pin_mem_area_set. */
+#define MAX_PIN_MEM_AREA_NUM	16
+
+/* One virtual address range, running from virt_start to virt_end. */
+struct pin_mem_area {
+	unsigned long virt_start;
+	unsigned long virt_end;
+};
+
+/* Payload of SET_PIN_MEM_AREA: area_num valid entries of mem_area for task pid. */
+struct pin_mem_area_set {
+	unsigned int pid;
+	unsigned int area_num;
+	struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
|
|
+
|
|
#endif /* __CR_RESTORER_H__ */
|
|
diff --git a/criu/mem.c b/criu/mem.c
|
|
index 167838b..709de4e 100644
|
|
--- a/criu/mem.c
|
|
+++ b/criu/mem.c
|
|
@@ -438,6 +438,88 @@ again:
|
|
return ret;
|
|
}
|
|
|
|
+/*
+ * Tell whether the memory of @vmae should go through the pin-memory
+ * path.
+ *
+ * vDSO, VVAR and AIO ring areas are never pinned, even when they also
+ * carry the private-anonymous status; of the remaining areas only
+ * private anonymous ones are.
+ */
+bool should_pin_vmae(VmaEntry *vmae)
+{
+	bool special_area = vma_entry_is(vmae, VMA_AREA_VDSO) ||
+			    vma_entry_is(vmae, VMA_AREA_VVAR) ||
+			    vma_entry_is(vmae, VMA_AREA_AIORING);
+
+	if (special_area)
+		return false;
+
+	if (!vma_entry_is(vmae, VMA_ANON_PRIVATE))
+		return false;
+
+	pr_debug("find private anon vma: %lx-%lx\n", vmae->start, vmae->end);
+	return true;
+}
|
|
+
|
|
+/*
+ * Pin one batch of the range [start, *pend) for @item.
+ *
+ * The range is cut into chunks of at most ONCE_PIN_MEM_SIZE_LIMIT bytes,
+ * and at most MAX_PIN_MEM_AREA_NUM chunks are passed to a single
+ * SET_PIN_MEM_AREA ioctl on @fd.  On return *pend holds the end address
+ * of the last chunk placed in the batch, so the caller can resume from
+ * there when the range did not fit into one batch.
+ *
+ * Returns the ioctl() result; a negative value means the request failed.
+ */
+static int pin_one_pmas(int fd, unsigned long start,
+			unsigned long *pend, struct pstree_item *item)
+{
+	int ret;
+	unsigned int index = 0;
+	unsigned long end = *pend;
+	unsigned long next = start;
+	/* Zeroed so unused mem_area slots do not carry stack garbage into the ioctl. */
+	struct pin_mem_area_set pmas = { 0 };
+
+	while (start < end && index < MAX_PIN_MEM_AREA_NUM) {
+		/* Compare the remaining length: "start + limit" could wrap around. */
+		if (end - start > ONCE_PIN_MEM_SIZE_LIMIT)
+			next = start + ONCE_PIN_MEM_SIZE_LIMIT;
+		else
+			next = end;
+
+		pmas.mem_area[index].virt_start = start;
+		pmas.mem_area[index].virt_end = next;
+		pr_info("start pin %lx-%lx\n", start, next);
+		index++;
+		start = next;
+	}
+
+	*pend = next;
+	pmas.area_num = index;
+	pmas.pid = vpid(item);
+	pr_info("begin pin memory for pid:%d\n", pmas.pid);
+	ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas);
+	if (ret < 0)
+		pr_err("pin mem fail, errno: %s\n", strerror(errno));
+	return ret;
+}
|
|
+/*
+ * Pin the whole address range of @vmae for @item.
+ *
+ * PIN_MEM_FILE is opened for the duration of the call and the range is
+ * submitted batch by batch through pin_one_pmas(), each batch resuming
+ * where the previous one stopped.
+ *
+ * Returns -1 if the file cannot be opened, otherwise the result of the
+ * last pin_one_pmas() call (0 when the range is empty).
+ */
+static int pin_vmae(VmaEntry *vmae, struct pstree_item *item)
+{
+	unsigned long cur, batch_end;
+	int ret = 0;
+	int fd = open(PIN_MEM_FILE, O_RDWR);
+
+	if (fd < 0) {
+		pr_err("open file: %s fail.\n", PIN_MEM_FILE);
+		return -1;
+	}
+
+	for (cur = vmae->start; cur < vmae->end; cur = batch_end) {
+		/* in: upper bound of what is left; out: where this batch ended */
+		batch_end = vmae->end;
+		ret = pin_one_pmas(fd, cur, &batch_end, item);
+		if (ret < 0)
+			break;
+	}
+
+	close(fd);
+	return ret;
+}
|
|
+
|
|
static int __parasite_dump_pages_seized(struct pstree_item *item,
|
|
struct parasite_dump_pages_args *args,
|
|
struct vm_area_list *vma_area_list,
|
|
@@ -513,7 +595,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
|
|
if (possible_pid_reuse == -1)
|
|
goto out_xfer;
|
|
}
|
|
-
|
|
+ if (opts.pin_memory) {
|
|
+ /* pin memory before dump pages */
|
|
+ list_for_each_entry(vma_area, &vma_area_list->h, list) {
|
|
+ if (should_pin_vmae(vma_area->e)) {
|
|
+ ret = pin_vmae(vma_area->e, item);
|
|
+ if (ret)
|
|
+ goto out_xfer;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
|
|
/*
|
|
* Step 1 -- generate the pagemap
|
|
@@ -524,6 +615,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
|
|
parent_predump_mode = mdc->parent_ie->pre_dump_mode;
|
|
|
|
list_for_each_entry(vma_area, &vma_area_list->h, list) {
|
|
+ if (opts.pin_memory && should_pin_vmae(vma_area->e))
|
|
+ continue;
|
|
+
|
|
ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
|
|
&pmc, has_parent, mdc->pre_dump,
|
|
parent_predump_mode);
|
|
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
|
|
index c63f96b..f3bd541 100644
|
|
--- a/criu/pie/restorer.c
|
|
+++ b/criu/pie/restorer.c
|
|
@@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args)
|
|
return 0;
|
|
}
|
|
|
|
+/*
+ * Issue the REMAP_PIN_MEM_AREA request for @pid on PIN_MEM_FILE.
+ *
+ * Returns -1 if the file cannot be opened, otherwise the sys_ioctl()
+ * result (negative on failure).
+ */
+int remap_vmas(int pid)
+{
+	int fd, ret;
+
+	fd = sys_open(PIN_MEM_FILE, O_RDWR, 0);
+	/*
+	 * The raw syscall wrapper reports failure as a negative errno
+	 * value, not necessarily -1, so test the sign -- as is done for
+	 * sys_ioctl() below.
+	 */
+	if (fd < 0) {
+		pr_err("open file: %s fail.\n", PIN_MEM_FILE);
+		return -1;
+	}
+
+	ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid);
+	if (ret < 0)
+		pr_err("remap pin mem fail for pid: %d\n", pid);
+	sys_close(fd);
+	return ret;
+}
|
|
+
|
|
+
|
|
/*
|
|
* The main routine to restore task via sigreturn.
|
|
* This one is very special, we never return there
|
|
@@ -1585,7 +1603,8 @@ long __export_restore_task(struct task_restore_args *args)
|
|
goto core_restore_end;
|
|
}
|
|
}
|
|
-
|
|
+ if (args->pin_memory)
|
|
+ remap_vmas(my_pid);
|
|
/*
|
|
* Now read the contents (if any)
|
|
*/
|
|
--
|
|
1.8.3.1
|
|
|