> + page_cache_release(page);
> + page = NULL;
> + }
> +
> + return page;
> +}
> +
> +/**
> + * cr_private_vma_fill_pgarr - collect addr/page pairs of a vma into a page-array
> + * @ctx - checkpoint context
> + * @pgarr - page-array that receives the pairs
> + * @vma - private vma to walk
> + * @start - address to resume the walk from; advanced past the last
> + *	address examined when the walk stops normally
> + *
> + * Walks @vma one page at a time starting at *@start, asks cr_follow_page()
> + * about each address and records every non-NULL page together with its
> + * address. The walk ends at vma->vm_end, or as soon as cr_pgarr_is_full()
> + * reports that @pgarr has no room left.
> + *
> + * Returns the number of pages added by this call, or the error carried
> + * by cr_follow_page() (in that case *@start is left untouched).
> + */
> +static int
> +cr_private_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
> +			  struct vm_area_struct *vma, unsigned long *start)
> +{
> +	const unsigned long limit = vma->vm_end;
> +	const int used_before = pgarr->nr_used;
> +	unsigned long cur;
> +
> +	/* shared mappings must never get here: private (anon/file) only */
> +	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
> +
> +	for (cur = *start; cur < limit; ) {
> +		struct page *pg = cr_follow_page(vma, cur);
> +
> +		if (IS_ERR(pg))
> +			return PTR_ERR(pg);
> +
> +		/* a NULL page means there is nothing to save at this address */
> +		if (pg != NULL) {
> +			pgarr->vaddrs[pgarr->nr_used] = cur;
> +			pgarr->pages[pgarr->nr_used] = pg;
> +			pgarr->nr_used++;
> +		}
> +
> +		cur += PAGE_SIZE;
> +
> +		/* stop once the array is full; caller resumes from *start */
> +		if (cr_pgarr_is_full(pgarr))
> +			break;
> +	}
> +
> +	*start = cur;
> +	return pgarr->nr_used - used_before;
> +}
> +
> +/*
> + * cr_page_write - write the contents of a single page to the image
> + * @ctx - checkpoint context
> + * @page - page whose contents are dumped
> + * @buf - bounce buffer, at least PAGE_SIZE bytes
> + *
> + * The page is copied into @buf under a kmap_atomic() mapping (used to
> + * avoid a TLB flush), and @buf is then handed to cr_kwrite().
> + *
> + * Returns whatever cr_kwrite() returns.
> + */
> +static int cr_page_write(struct cr_ctx *ctx, struct page *page, char *buf)
> +{
> +	void *ptr;
> +
> +	ptr = kmap_atomic(page, KM_USER1);
> +	memcpy(buf, ptr, PAGE_SIZE);
> +	/* kunmap_atomic() takes the mapped address, not the struct page */
> +	kunmap_atomic(ptr, KM_USER1);
> +
> +	return cr_kwrite(ctx, buf, PAGE_SIZE);
> +}
> +
> +/**
> + * cr_vma_dump_pages - write out the pages queued on the ctx page-array chain
> + * @ctx - checkpoint context
> + * @total - total number of queued pages; when 0 nothing is written
> + *
> + * Output is produced in two passes over ctx->pgarr_list (walked in
> + * reverse): first the virtual addresses of every page-array, then the
> + * contents of every page, one PAGE_SIZE write per page.
> + *
> + * Returns a negative error on failure (-ENOMEM if the bounce buffer
> + * cannot be allocated), otherwise the result of the last write.
> + */
> +static int cr_vma_dump_pages(struct cr_ctx *ctx, int total)
> +{
> +	struct cr_pgarr *pa;
> +	char *bounce;
> +	int idx;
> +	int err = 0;
> +
> +	if (total == 0)
> +		return 0;
> +
> +	/* pass 1: the addresses */
> +	list_for_each_entry_reverse(pa, &ctx->pgarr_list, list) {
> +		err = cr_kwrite(ctx, pa->vaddrs,
> +				pa->nr_used * sizeof(*pa->vaddrs));
> +		if (err < 0)
> +			return err;
> +	}
> +
> +	/* pages are copied through this buffer by cr_page_write() */
> +	bounce = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (bounce == NULL)
> +		return -ENOMEM;
> +
> +	/* pass 2: the page contents, in the same order as the addresses */
> +	list_for_each_entry_reverse(pa, &ctx->pgarr_list, list) {
> +		for (idx = 0; idx < pa->nr_used; idx++) {
> +			err = cr_page_write(ctx, pa->pages[idx], bounce);
> +			if (err < 0)
> +				goto free_bounce;
> +		}
> +	}
> +
> + free_bounce:
> +	kfree(bounce);
> +	return err;
> +}
> +
> +/**
> + * cr_write_private_vma_contents - dump contents of a VMA with private memory
> + * @ctx - checkpoint context
> + * @vma - vma to scan
> + *
> + * Collect lists of pages that need to be dumped, and their corresponding
> + * virtual addresses, into the ctx->pgarr_list page-array chain. Then dump
> + * the addresses, followed by the page contents.
> + *
> + * Returns a negative error on failure (-ENOMEM if no page-array is
> + * available), otherwise the result of writing the terminating header.
> + */
> +static int
> +cr_write_private_vma_contents(struct cr_ctx *ctx, struct vm_area_struct *vma)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_pgarr *hh;
> +	unsigned long addr = vma->vm_start;
> +	struct cr_pgarr *pgarr;
> +	unsigned long cnt = 0;
> +	int ret;
> +
> +	/*
> +	 * Work iteratively, collecting and dumping at most CR_PGARR_CHUNK
> +	 * in each round. Each iteration is divided into two steps:
> +	 *
> +	 * (1) scan: scan through the PTEs of the vma to collect the pages
> +	 * to dump (later we'll also make them COW), while keeping a list
> +	 * of pages and their corresponding addresses on ctx->pgarr_list.
> +	 *
> +	 * (2) dump: write out a header specifying how many pages, followed
> +	 * by the addresses of all pages in ctx->pgarr_list, followed by
> +	 * the actual contents of all pages. (Then, release the references
> +	 * to the pages and reset the page-array chain).
> +	 *
> +	 * (This split makes the logic simpler by first counting the pages
> +	 * that need saving. More importantly, it allows for a future
> +	 * optimization that will reduce application downtime by deferring
> +	 * the actual write-out of the data to after the application is
> +	 * allowed to resume execution).
> +	 *
> +	 * After dumping the entire contents, conclude with a header that
> +	 * specifies 0 pages to mark the end of the contents.
> +	 */
> +
> +	h.type = CR_HDR_PGARR;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;
> +
> +	while (addr < vma->vm_end) {
> +		pgarr = cr_pgarr_current(ctx);
> +		if (!pgarr)
> +			return -ENOMEM;
> +		ret = cr_private_vma_fill_pgarr(ctx, pgarr, vma, &addr);
> +		if (ret < 0)
> +			return ret;
> +		cnt += ret;
> +
> +		/* did we complete a chunk, or is this the last chunk ? */
> +		if (cnt >= CR_PGARR_CHUNK || (cnt && addr == vma->vm_end)) {
> +			hh = cr_hbuf_get(ctx, sizeof(*hh));
> +			hh->nr_pages = cnt;
> +			ret = cr_write_obj(ctx, &h, hh);
> +			cr_hbuf_put(ctx, sizeof(*hh));
> +			if (ret < 0)
> +				return ret;
> +
> +			ret = cr_vma_dump_pages(ctx, cnt);
> +			if (ret < 0)
> +				return ret;
> +
> +			cr_pgarr_reset_all(ctx);
> +
> +			/*
> +			 * The chain is empty again, so the count must start
> +			 * over; otherwise the next header would report the
> +			 * pages of previous chunks as well.
> +			 */
> +			cnt = 0;
> +		}
> +	}
> +
> +	/* mark end of contents with header saying "0" pages */
> +	hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	hh->nr_pages = 0;
> +	ret = cr_write_obj(ctx, &h, hh);
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +
> +	return ret;
> +}
> +
> +/*
> + * cr_write_vma - write the description and contents of one vma
> + * @ctx - checkpoint context
> + * @vma - vma to save
> + *
> + * Writes a CR_HDR_VMA record, then the name of the backing file (if
> + * vma->vm_file is set), then the private memory contents of the vma.
> + *
> + * Returns -ETXTBSY for VMAs flagged VM_SHARED, VM_IO, VM_HUGETLB or
> + * VM_NONLINEAR, a negative error if any write fails, otherwise the
> + * result of cr_write_private_vma_contents().
> + */
> +static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_vma *hh;
> +	int vma_type, ret;
> +
> +	/*
> +	 * Reject unsupported VMAs before taking header-buffer space, so
> +	 * that this early return has no cr_hbuf_get() left to undo.
> +	 */
> +	if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) {
> +		pr_warning("CR: unsupported VMA %#lx\n", vma->vm_flags);
> +		return -ETXTBSY;
> +	}
> +
> +	hh = cr_hbuf_get(ctx, sizeof(*hh));
> +
> +	h.type = CR_HDR_VMA;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;
> +
> +	hh->vm_start = vma->vm_start;
> +	hh->vm_end = vma->vm_end;
> +	hh->vm_page_prot = vma->vm_page_prot.pgprot;
> +	hh->vm_flags = vma->vm_flags;
> +	hh->vm_pgoff = vma->vm_pgoff;
> +
> +	/* by default assume anon memory */
> +	vma_type = CR_VMA_ANON;
> +
> +	/*
> +	 * if there is a backing file, assume private-mapped
> +	 * (FIXME: check if the file is unlinked)
> +	 */
> +	if (vma->vm_file)
> +		vma_type = CR_VMA_FILE;
> +
> +	hh->vma_type = vma_type;
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	/* save the file name, if relevant */
> +	if (vma->vm_file) {
> +		ret = cr_write_fname(ctx, &vma->vm_file->f_path, ctx->vfsroot);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return cr_write_private_vma_contents(ctx, vma);
> +}
> +
> +/*
> + * cr_write_mm - write the memory description of a task
> + * @ctx - checkpoint context
> + * @t - task whose mm is saved
> + *
> + * Writes a CR_HDR_MM record filled from the task's mm_struct, then every
> + * vma on mm->mmap via cr_write_vma(), then the result of
> + * cr_write_mm_context(). mm->mmap_sem is held for reading throughout.
> + *
> + * Returns -EINVAL if @t has no mm, a negative error if any write fails,
> + * otherwise the result of cr_write_mm_context().
> + */
> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_mm *hh;
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	int objref, ret;
> +
> +	/*
> +	 * get_task_mm() returns NULL for a task without an mm; bail out
> +	 * before touching mm->mmap_sem or taking header-buffer space.
> +	 */
> +	mm = get_task_mm(t);
> +	if (!mm)
> +		return -EINVAL;
> +
> +	hh = cr_hbuf_get(ctx, sizeof(*hh));
> +
> +	h.type = CR_HDR_MM;
> +	h.len = sizeof(*hh);
> +	h.parent = task_pid_vnr(t);
> +
> +	objref = 0;	/* will be meaningful with multiple processes */
> +	hh->objref = objref;
> +
> +	down_read(&mm->mmap_sem);
> +
> +	hh->start_code = mm->start_code;
> +	hh->end_code = mm->end_code;
> +	hh->start_data = mm->start_data;
> +	hh->end_data = mm->end_data;
> +	hh->start_brk = mm->start_brk;
> +	hh->brk = mm->brk;
> +	hh->start_stack = mm->start_stack;
> +	hh->arg_start = mm->arg_start;
> +	hh->arg_end = mm->arg_end;
> +	hh->env_start = mm->env_start;
> +	hh->env_end = mm->env_end;
> +
> +	hh->map_count = mm->map_count;
> +
> +	/* FIX: need also mm->flags */
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +	if (ret < 0)
> +		goto out;
> +
> +	/* write the vma's */
> +	for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +		ret = cr_write_vma(ctx, vma);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	ret = cr_write_mm_context(ctx, mm, objref);
> +
> + out:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +	return ret;
> +}
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 30863c6..c4ac157 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -16,6 +16,8 @@
> #include <linux/capability.h>
> #include <linux/checkpoint.h>
>
> +#include "checkpoint_mem.h"
> +
> /*
> * helpers to write/read to/from the image file descriptor
> *
> @@ -111,7 +113,6 @@ int cr_kread(struct cr_ctx *ctx, void *buf, int count)
> return ret;
> }
>
> -
> /*
> * helpers to manage CR contexts: allocated for each checkpoint and/or
> * restart operation, and persists until the operation is completed.
> @@ -127,6 +128,11 @@ void cr_ctx_free(struct cr_ctx *ctx)
>
> free_pages((unsigned long) ctx->hbuf, CR_HBUF_ORDER);
>
> + if (ctx->vfsroot)
> + path_put(ctx->vfsroot);
> +
> + cr_pgarr_free(ctx);
> +
> kfree(ctx);
> }
>
> @@ -145,10 +151,17 @@ struct cr_ctx *cr_ctx_alloc(pid_t pid, int fd, unsigned long flags)
> }
>
> ctx->hbuf = (void *) __get_free_pages(GFP_KERNEL, CR_HBUF_ORDER);
> - if (!ctx->hbuf) {
> - cr_ctx_free(ctx);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!ctx->hbuf)
> + goto nomem;
> +
> + /*
> + * assume checkpointer is in container's root vfs
> + * FIXME: this works for now, but will change with real containers
> + */
> + ctx->vfsroot = &current->fs->root;
> + path_get(ctx->vfsroot);
> +
> + INIT_LIST_HEAD(&ctx->pgarr_list);
>
> ctx->pid = pid;
> ctx->flags = flags;
> @@ -156,6 +169,10 @@ struct cr_ctx *cr_ctx_alloc(pid_t pid, int fd, unsigned long flags)
> ctx->crid = atomic_inc_return(&cr_ctx_count);
>
> return ctx;
> +
> + nomem:
> + cr_ctx_free(ctx);
> + return ERR_PTR(-ENOMEM);
> }
>
> /*
> diff --git a/include/asm-x86/checkpoint_hdr.h b/include/asm-x86/checkpoint_hdr.h
> index 44a903c..6bc61ac 100644
> --- a/include/asm-x86/checkpoint_hdr.h
> +++ b/include/asm-x86/checkpoint_hdr.h
> @@ -69,4 +69,9 @@ struct cr_hdr_cpu {
>
> } __attribute__((aligned(8)));
>
> +/*
> + * x86 image header describing an mm context.
> + * NOTE(review): presumably the payload of a CR_HDR_MM_CONTEXT record and
> + * followed by the LDT entries themselves - confirm against the writer.
> + */
> +struct cr_hdr_mm_context {
> + __s16 ldt_entry_size; /* presumably size of one LDT entry - TODO confirm */
> + __s16 nldt; /* presumably number of LDT entries - TODO confirm */
> +} __attribute__((aligned(8)));
> +
> #endif /* __ASM_X86_CKPT_HDR__H */
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 5e53ae6..d74e64d 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,6 +10,9 @@
> * distribution for more details.
> */
>
> +#include <linux/path.h>
> +#include <linux/fs.h>
> +
> #define CR_VERSION 1
>
> struct cr_ctx {
> @@ -24,6 +27,10 @@ struct cr_ctx {
>
> void *hbuf; /* temporary buffer for headers */
> int hpos; /* position in headers buffer */
> +
> + struct list_head pgarr_list; /* page array to dump VMA contents */
> +
> + struct path *vfsroot; /* container root (FIXME) */
> };
>
> /* cr_ctx: flags */
> @@ -46,11 +53,16 @@ struct cr_hdr;
>
> extern int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf);
> extern int cr_write_string(struct cr_ctx *ctx, char *str, int len);
> +extern int cr_write_fname(struct cr_ctx *ctx,
> + struct path *path, struct path *root);
>
> extern int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n);
> extern int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type);
> extern int cr_read_string(struct cr_ctx *ctx, void *str, int len);
>
> +extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
> +extern int cr_read_mm(struct cr_ctx *ctx);
> +
> extern int do_checkpoint(struct cr_ctx *ctx);
> extern int do_restart(struct cr_ctx *ctx);
>
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 03ec72e..2b110f1 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -32,6 +32,7 @@ struct cr_hdr {
> enum {
> CR_HDR_HEAD = 1,
> CR_HDR_STRING,
> + CR_HDR_FNAME,
>
> CR_HDR_TASK = 101,
> CR_HDR_THREAD,
> @@ -39,6 +40,7 @@ enum {
>
> CR_HDR_MM = 201,
> CR_HDR_VMA,
> + CR_HDR_PGARR,
> CR_HDR_MM_CONTEXT,
>
> CR_HDR_TAIL = 5001
> @@ -73,4 +75,34 @@ struct cr_hdr_task {
> __s32 task_comm_len;
> } __attribute__((aligned(8)));
>
> +/* payload of a CR_HDR_MM record; filled from the task's mm_struct by cr_write_mm() */
> +struct cr_hdr_mm {
> + __u32 objref; /* identifier for shared objects */
> + __u32 map_count; /* copy of mm->map_count */
> +
> + /* each field below is a copy of the same-named mm_struct member */
> + __u64 start_code, end_code, start_data, end_data;
> + __u64 start_brk, brk, start_stack;
> + __u64 arg_start, arg_end, env_start, env_end;
> +} __attribute__((aligned(8)));
> +
> +/* vma subtypes, stored in cr_hdr_vma.vma_type by cr_write_vma() */
> +enum vm_type {
> + CR_VMA_ANON = 1, /* default: the vma has no backing file */
> + CR_VMA_FILE /* vma->vm_file is set; assumed to be private-mapped */
> +};
> +
> +/* payload of a CR_HDR_VMA record; filled by cr_write_vma() */
> +struct cr_hdr_vma {
> + __u32 vma_type; /* CR_VMA_ANON or CR_VMA_FILE */
> + __u32 _padding; /* not assigned by cr_write_vma() */
> +
> + __u64 vm_start; /* copy of vma->vm_start */
> + __u64 vm_end; /* copy of vma->vm_end */
> + __u64 vm_page_prot; /* copy of vma->vm_page_prot.pgprot */
> + __u64 vm_flags; /* copy of vma->vm_flags */
> + __u64 vm_pgoff; /* copy of vma->vm_pgoff */
> +} __attribute__((aligned(8)));
> +
> +/*
> + * payload of a CR_HDR_PGARR record; written by
> + * cr_write_private_vma_contents() ahead of each batch of addresses and
> + * page contents
> + */
> +struct cr_hdr_pgarr {
> + __u64 nr_pages; /* number of pages saved in the batch; 0 marks the end */
> +} __attribute__((aligned(8)));
> +
> #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> --
> 1.5.4.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to
majordomo@vger.kernel.org
> More majordomo info at
http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at
http://www.tux.org/lkml/
>