From 2965faa5e03d1e71e9ff9aa143fff39e0a77543a Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 9 Sep 2015 15:38:55 -0700 Subject: kexec: split kexec_load syscall from kexec core code There are two kexec load syscalls, kexec_load another and kexec_file_load. kexec_file_load has been splited as kernel/kexec_file.c. In this patch I split kexec_load syscall code to kernel/kexec.c. And add a new kconfig option KEXEC_CORE, so we can disable kexec_load and use kexec_file_load only, or vice verse. The original requirement is from Ted Ts'o, he want kexec kernel signature being checked with CONFIG_KEXEC_VERIFY_SIG enabled. But kexec-tools use kexec_load syscall can bypass the checking. Vivek Goyal proposed to create a common kconfig option so user can compile in only one syscall for loading kexec kernel. KEXEC/KEXEC_FILE selects KEXEC_CORE so that old config files still work. Because there's general code need CONFIG_KEXEC_CORE, so I updated all the architecture Kconfig with a new option KEXEC_CORE, and let KEXEC selects KEXEC_CORE in arch Kconfig. Also updated general kernel code with to kexec_load syscall. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Young Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Petr Tesarik Cc: Theodore Ts'o Cc: Josh Boyer Cc: David Howells Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 1495 +------------------------------------------------------- 1 file changed, 2 insertions(+), 1493 deletions(-) (limited to 'kernel/kexec.c') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2d73ecfa5505..4c5edc357923 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,148 +1,23 @@ /* - * kexec.c - kexec system call + * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt - #include #include #include -#include -#include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include +#include -#include -#include #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t __percpu *crash_notes; - -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Flag to indicate we are going to kexec a new kernel */ -bool kexec_in_progress = false; - - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in do_exit() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in . - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. - * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. - */ -#define KIMAGE_NO_DEST (-1UL) - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); - static int copy_user_segment_list(struct kimage *image, unsigned long nr_segments, struct kexec_segment __user *segments) @@ -160,123 +35,6 @@ static int copy_user_segment_list(struct kimage *image, return ret; } -int sanity_check_segment_list(struct kimage *image) -{ - int result, i; - unsigned long nr_segments = image->nr_segments; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addresses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - return result; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - return result; - } - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - - if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; - } - } - - return 0; -} - -struct kimage *do_kimage_alloc_init(void) -{ - struct kimage *image; - - /* Allocate a controlling structure */ - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - return NULL; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); - - return image; -} - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -343,597 +101,6 @@ out_free_image: return ret; } -int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everything to image->dest_pages. - * - * For now it is simpler to just free the pages. - */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! */ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unusable_pages); - -} -void kimage_terminate(struct kimage *image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - - /* - * Free up any temporary buffers allocated. This might hit if - * error occurred much later after buffer allocation. - */ - if (image->file_mode) - kimage_file_post_load_cleanup(image); - - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. - * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unusable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. - */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } else { - /* Place the page on the destination list I - * will use it later. - */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - if (mchunk > uchunk) { - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -954,9 +121,6 @@ int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; -int kexec_load_disabled; SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) @@ -1051,18 +215,6 @@ out: return result; } -/* - * Add and remove page tables for crashkernel memory - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, nr_segments, @@ -1101,646 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return sys_kexec_load(entry, nr_segments, ksegments, flags); } #endif - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -size_t crash_get_memory_size(void) -{ - size_t size = 0; - mutex_lock(&kexec_mutex); - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); - return size; -} - -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; - - mutex_lock(&kexec_mutex); - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_map_reserved_pages(); - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; - ram_res->name = "System RAM"; - - crashk_res.end = end - 1; - - insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); - -unlock: - mutex_unlock(&kexec_mutex); - return ret; -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); - return -ENOMEM; - } - return 0; -} -subsys_initcall(crash_notes_memory_init); - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __weak paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - lock_system_sleep(); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. - */ - error = dpm_suspend_end(PMSG_FREEZE); - if (error) - goto Resume_devices; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - local_irq_disable(); - error = syscore_suspend(); - if (error) - goto Enable_irqs; - } else -#endif - { - kexec_in_progress = true; - kernel_restart_prepare(NULL); - migrate_to_reboot_cpu(); - - /* - * migrate_to_reboot_cpu() disables CPU hotplug assuming that - * no further code needs to use CPU hotplug (which is true in - * the reboot case). However, the kexec path depends on using - * CPU hotplug again; so re-enable it here. - */ - cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - syscore_resume(); - Enable_irqs: - local_irq_enable(); - Enable_cpus: - enable_nonboot_cpus(); - dpm_resume_start(PMSG_RESTORE); - Resume_devices: - dpm_resume_end(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - unlock_system_sleep(); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} -- cgit