Diffstat (limited to 'drivers/xen/balloon.c')
| -rw-r--r-- | drivers/xen/balloon.c | 366 |
1 file changed, 221 insertions, 145 deletions
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index ceb5048de9a7..49c3f9926394 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,6 +43,8 @@
 #include <linux/sched.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/memblock.h>
 #include <linux/pagemap.h>
@@ -56,10 +58,10 @@
 #include <linux/percpu-defs.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
+#include <linux/moduleparam.h>
+#include <linux/jiffies.h>
 
 #include <asm/page.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
 #include <asm/tlb.h>
 
 #include <asm/xen/hypervisor.h>
@@ -73,44 +75,29 @@
 #include <xen/page.h>
 #include <xen/mem-reservation.h>
 
-static int xen_hotplug_unpopulated;
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
 
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+static uint __read_mostly balloon_boot_timeout = 180;
+module_param(balloon_boot_timeout, uint, 0444);
 
-static int zero;
-static int one = 1;
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+static int xen_hotplug_unpopulated;
 
-static struct ctl_table balloon_table[] = {
+static const struct ctl_table balloon_table[] = {
 	{
 		.procname	= "hotplug_unpopulated",
 		.data		= &xen_hotplug_unpopulated,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
-	{ }
-};
-
-static struct ctl_table balloon_root[] = {
-	{
-		.procname	= "balloon",
-		.mode		= 0555,
-		.child		= balloon_table,
-	},
-	{ }
-};
-
-static struct ctl_table xen_root[] = {
-	{
-		.procname	= "xen",
-		.mode		= 0555,
-		.child		= balloon_root,
-	},
-	{ }
 };
+#else
+#define xen_hotplug_unpopulated 0
 #endif
 
 /*
@@ -120,7 +107,7 @@ static struct ctl_table xen_root[] = {
 #define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)
 
 /*
- * balloon_process() state:
+ * balloon_thread() state:
  *
  * BP_DONE: done or nothing to do,
  * BP_WAIT: wait to be rescheduled,
@@ -128,13 +115,15 @@
  * BP_ECANCELED: error, balloon operation canceled.
  */
 
-enum bp_state {
+static enum bp_state {
 	BP_DONE,
 	BP_WAIT,
 	BP_EAGAIN,
 	BP_ECANCELED
-};
+} balloon_state = BP_DONE;
+
+/* Main waiting point for xen-balloon thread. */
+static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq);
 
 static DEFINE_MUTEX(balloon_mutex);
@@ -149,18 +138,17 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];
 
 static LIST_HEAD(ballooned_pages);
 static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
 
-/* Main work function, always executed in process context. */
-static void balloon_process(struct work_struct *work);
-static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);
-
 /* When ballooning out (allocating memory to return to Xen) we don't really
    want the kernel to try too hard since that can trigger the oom killer. */
 #define GFP_BALLOON \
 	(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
 
 /* balloon_append: add the given page to the balloon. */
-static void __balloon_append(struct page *page)
+static void balloon_append(struct page *page)
 {
+	if (!PageOffline(page))
+		__SetPageOffline(page);
+
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
 		list_add_tail(&page->lru, &ballooned_pages);
@@ -169,12 +157,9 @@ static void __balloon_append(struct page *page)
 		list_add(&page->lru, &ballooned_pages);
 		balloon_stats.balloon_low++;
 	}
-	wake_up(&balloon_wq);
-}
+	inc_node_page_state(page, NR_BALLOON_PAGES);
 
-static void balloon_append(struct page *page)
-{
-	__balloon_append(page);
+	wake_up(&balloon_wq);
 }
 
 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
@@ -195,6 +180,9 @@ static struct page *balloon_retrieve(bool require_lowmem)
 	else
 		balloon_stats.balloon_low--;
 
+	__ClearPageOffline(page);
+	dec_node_page_state(page, NR_BALLOON_PAGES);
+
 	return page;
 }
 
@@ -206,18 +194,15 @@ static struct page *balloon_next_page(struct page *page)
 	return list_entry(next, struct page, lru);
 }
 
-static enum bp_state update_schedule(enum bp_state state)
+static void update_schedule(void)
 {
-	if (state == BP_WAIT)
-		return BP_WAIT;
-
-	if (state == BP_ECANCELED)
-		return BP_ECANCELED;
+	if (balloon_state == BP_WAIT || balloon_state == BP_ECANCELED)
+		return;
 
-	if (state == BP_DONE) {
+	if (balloon_state == BP_DONE) {
 		balloon_stats.schedule_delay = 1;
 		balloon_stats.retry_count = 1;
-		return BP_DONE;
+		return;
 	}
 
 	++balloon_stats.retry_count;
@@ -226,7 +211,8 @@
 	    balloon_stats.retry_count > balloon_stats.max_retry_count) {
 		balloon_stats.schedule_delay = 1;
 		balloon_stats.retry_count = 1;
-		return BP_ECANCELED;
+		balloon_state = BP_ECANCELED;
+		return;
 	}
 
 	balloon_stats.schedule_delay <<= 1;
@@ -234,7 +220,7 @@
 	if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
 		balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;
 
-	return BP_EAGAIN;
+	balloon_state = BP_EAGAIN;
 }
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
@@ -272,20 +258,6 @@ static struct resource *additional_memory_resource(phys_addr_t size)
 		return NULL;
 	}
 
-#ifdef CONFIG_SPARSEMEM
-	{
-		unsigned long limit = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT);
-		unsigned long pfn = res->start >> PAGE_SHIFT;
-
-		if (pfn > limit) {
-			pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n",
-			       pfn, limit);
-			release_memory_resource(res);
-			return NULL;
-		}
-	}
-#endif
-
 	return res;
 }
 
@@ -330,7 +302,7 @@ static enum bp_state reserve_additional_memory(void)
 	 * are not restored since this region is now known not to
	 * conflict with any devices.
 	 */
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (xen_pv_domain()) {
 		unsigned long pfn, i;
 
 		pfn = PFN_DOWN(resource->start);
@@ -352,7 +324,7 @@ static enum bp_state reserve_additional_memory(void)
 	mutex_unlock(&balloon_mutex);
 	/* add_memory_resource() requires the device_hotplug lock */
 	lock_device_hotplug();
-	rc = add_memory_resource(nid, resource);
+	rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE);
 	unlock_device_hotplug();
 	mutex_lock(&balloon_mutex);
 
@@ -369,21 +341,25 @@ static enum bp_state reserve_additional_memory(void)
 	return BP_ECANCELED;
 }
 
-static void xen_online_page(struct page *page)
+static void xen_online_page(struct page *page, unsigned int order)
 {
-	__online_page_set_limits(page);
+	unsigned long i, size = (1 << order);
+	unsigned long start_pfn = page_to_pfn(page);
+	struct page *p;
+
+	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
 	mutex_lock(&balloon_mutex);
-
-	__balloon_append(page);
-
+	for (i = 0; i < size; i++) {
+		p = pfn_to_page(start_pfn + i);
+		balloon_append(p);
+	}
 	mutex_unlock(&balloon_mutex);
 }
 
 static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
 {
 	if (val == MEM_ONLINE)
-		schedule_delayed_work(&balloon_worker, 0);
+		wake_up(&balloon_thread_wq);
 
 	return NOTIFY_OK;
 }
@@ -395,7 +371,8 @@ static struct notifier_block xen_memory_nb = {
 #else
 static enum bp_state reserve_additional_memory(void)
 {
-	balloon_stats.target_pages = balloon_stats.current_pages;
+	balloon_stats.target_pages = balloon_stats.current_pages +
+				     balloon_stats.target_unpopulated;
 	return BP_ECANCELED;
 }
 #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
@@ -440,7 +417,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 
 		xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
 
-		/* Relinquish the page back to the allocator. */
+		/*
+		 * Relinquish the page back to the allocator. Note that
+		 * some pages, including ones added via xen_online_page(), might
+		 * not be marked reserved; free_reserved_page() will handle that.
+		 */
 		free_reserved_page(page);
 	}
 
@@ -507,43 +488,79 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 }
 
 /*
- * As this is a work item it is guaranteed to run as a single instance only.
+ * Stop waiting if either state is BP_DONE and ballooning action is
+ * needed, or if the credit has changed while state is not BP_DONE.
+ */
+static bool balloon_thread_cond(long credit)
+{
+	if (balloon_state == BP_DONE)
+		credit = 0;
+
+	return current_credit() != credit || kthread_should_stop();
+}
+
+/*
+ * As this is a kthread it is guaranteed to run as a single instance only.
  * We may of course race updates of the target counts (which are protected
  * by the balloon lock), or with changes to the Xen hard limit, but we will
  * recover from these in time.
 */
-static void balloon_process(struct work_struct *work)
+static int balloon_thread(void *unused)
 {
-	enum bp_state state = BP_DONE;
 	long credit;
+	unsigned long timeout;
+
+	set_freezable();
+	for (;;) {
+		switch (balloon_state) {
+		case BP_DONE:
+		case BP_ECANCELED:
+			timeout = 3600 * HZ;
+			break;
+		case BP_EAGAIN:
+			timeout = balloon_stats.schedule_delay * HZ;
+			break;
+		case BP_WAIT:
+			timeout = HZ;
+			break;
+		}
+
+		credit = current_credit();
+
+		wait_event_freezable_timeout(balloon_thread_wq,
				balloon_thread_cond(credit), timeout);
+		if (kthread_should_stop())
+			return 0;
 
-	do {
 		mutex_lock(&balloon_mutex);
 
 		credit = current_credit();
 
 		if (credit > 0) {
 			if (balloon_is_inflated())
-				state = increase_reservation(credit);
+				balloon_state = increase_reservation(credit);
 			else
-				state = reserve_additional_memory();
+				balloon_state = reserve_additional_memory();
 		}
 
-		if (credit < 0)
-			state = decrease_reservation(-credit, GFP_BALLOON);
+		if (credit < 0) {
+			long n_pages;
+
+			n_pages = min(-credit, si_mem_available());
+			balloon_state = decrease_reservation(n_pages,
+							     GFP_BALLOON);
+			if (balloon_state == BP_DONE && n_pages != -credit &&
+			    n_pages < totalreserve_pages)
+				balloon_state = BP_EAGAIN;
+		}
 
-		state = update_schedule(state);
+		update_schedule();
 
 		mutex_unlock(&balloon_mutex);
 
 		cond_resched();
-
-	} while (credit && state == BP_DONE);
-
-	/* Schedule more work if there is some still to be done. */
-	if (state == BP_EAGAIN)
-		schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);
+	}
 }
 
 /* Resets the Xen limit, sets new target, and kicks off processing. */
@@ -551,25 +568,30 @@
 {
 	/* No need for lock. Not read-modify-write updates. */
 	balloon_stats.target_pages = target;
-	schedule_delayed_work(&balloon_worker, 0);
+	wake_up(&balloon_thread_wq);
 }
 EXPORT_SYMBOL_GPL(balloon_set_new_target);
 
-static int add_ballooned_pages(int nr_pages)
+static int add_ballooned_pages(unsigned int nr_pages)
 {
 	enum bp_state st;
 
 	if (xen_hotplug_unpopulated) {
 		st = reserve_additional_memory();
 		if (st != BP_ECANCELED) {
+			int rc;
+
 			mutex_unlock(&balloon_mutex);
-			wait_event(balloon_wq,
+			rc = wait_event_interruptible(balloon_wq,
				   !list_empty(&ballooned_pages));
 			mutex_lock(&balloon_mutex);
-			return 0;
+			return rc ? -ENOMEM : 0;
 		}
 	}
 
+	if (si_mem_available() < nr_pages)
+		return -ENOMEM;
+
 	st = decrease_reservation(nr_pages, GFP_USER);
 	if (st != BP_DONE)
 		return -ENOMEM;
@@ -578,14 +600,14 @@ static int add_ballooned_pages(int nr_pages)
 }
 
 /**
- * alloc_xenballooned_pages - get pages that have been ballooned out
+ * xen_alloc_ballooned_pages - get pages that have been ballooned out
  * @nr_pages: Number of pages to get
  * @pages: pages returned
  * @return 0 on success, error otherwise
 */
-int alloc_xenballooned_pages(int nr_pages, struct page **pages)
+int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages)
 {
-	int pgno = 0;
+	unsigned int pgno = 0;
 	struct page *page;
 	int ret;
 
@@ -604,7 +626,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages)
 			 */
 			BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
 
-			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			if (xen_pv_domain()) {
 				ret = xen_alloc_p2m_entry(page_to_pfn(page));
 				if (ret < 0)
 					goto out_undo;
@@ -620,19 +642,25 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages)
 	return 0;
  out_undo:
 	mutex_unlock(&balloon_mutex);
-	free_xenballooned_pages(pgno, pages);
+	xen_free_ballooned_pages(pgno, pages);
+	/*
+	 * NB: xen_free_ballooned_pages will only subtract pgno pages, but since
+	 * target_unpopulated is incremented with nr_pages at the start we need
+	 * to remove the remaining ones also, or accounting will be screwed.
+	 */
+	balloon_stats.target_unpopulated -= nr_pages - pgno;
 	return ret;
 }
-EXPORT_SYMBOL(alloc_xenballooned_pages);
+EXPORT_SYMBOL(xen_alloc_ballooned_pages);
 
 /**
- * free_xenballooned_pages - return pages retrieved with get_ballooned_pages
+ * xen_free_ballooned_pages - return pages retrieved with get_ballooned_pages
  * @nr_pages: Number of pages
  * @pages: pages to return
 */
-void free_xenballooned_pages(int nr_pages, struct page **pages)
+void xen_free_ballooned_pages(unsigned int nr_pages, struct page **pages)
 {
-	int i;
+	unsigned int i;
 
 	mutex_lock(&balloon_mutex);
@@ -645,52 +673,70 @@ void free_xenballooned_pages(int nr_pages, struct page **pages)
 
 	/* The balloon may be too large now. Shrink it if needed. */
 	if (current_credit())
-		schedule_delayed_work(&balloon_worker, 0);
+		wake_up(&balloon_thread_wq);
 
 	mutex_unlock(&balloon_mutex);
 }
-EXPORT_SYMBOL(free_xenballooned_pages);
+EXPORT_SYMBOL(xen_free_ballooned_pages);
 
-#ifdef CONFIG_XEN_PV
-static void __init balloon_add_region(unsigned long start_pfn,
-				      unsigned long pages)
+static int __init balloon_add_regions(void)
 {
+	unsigned long start_pfn, pages;
 	unsigned long pfn, extra_pfn_end;
-	struct page *page;
+	unsigned int i;
 
-	/*
-	 * If the amount of usable memory has been limited (e.g., with
-	 * the 'mem' command line parameter), don't add pages beyond
-	 * this limit.
-	 */
-	extra_pfn_end = min(max_pfn, start_pfn + pages);
-
-	for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) {
-		page = pfn_to_page(pfn);
-		/* totalram_pages and totalhigh_pages do not
-		   include the boot-time balloon extension, so
-		   don't subtract from it. */
-		__balloon_append(page);
+	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+		pages = xen_extra_mem[i].n_pfns;
+		if (!pages)
+			continue;
+
+		start_pfn = xen_extra_mem[i].start_pfn;
+
+		/*
+		 * If the amount of usable memory has been limited (e.g., with
+		 * the 'mem' command line parameter), don't add pages beyond
+		 * this limit.
+		 */
+		extra_pfn_end = min(max_pfn, start_pfn + pages);
+
+		for (pfn = start_pfn; pfn < extra_pfn_end; pfn++)
+			balloon_append(pfn_to_page(pfn));
+
+		/*
+		 * Extra regions are accounted for in the physmap, but need
+		 * decreasing from current_pages and target_pages to balloon
+		 * down the initial allocation, because they are already
+		 * accounted for in total_pages.
+		 */
+		pages = extra_pfn_end - start_pfn;
+		if (pages >= balloon_stats.current_pages ||
+		    pages >= balloon_stats.target_pages) {
+			WARN(1, "Extra pages underflow current target");
+			return -ERANGE;
+		}
+		balloon_stats.current_pages -= pages;
+		balloon_stats.target_pages -= pages;
 	}
 
-	balloon_stats.total_pages += extra_pfn_end - start_pfn;
+	return 0;
 }
-#endif
 
 static int __init balloon_init(void)
 {
+	struct task_struct *task;
+	int rc;
+
 	if (!xen_domain())
 		return -ENODEV;
 
 	pr_info("Initialising balloon driver\n");
 
-#ifdef CONFIG_XEN_PV
-	balloon_stats.current_pages = xen_pv_domain()
-		? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
-		: get_num_physpages();
-#else
-	balloon_stats.current_pages = get_num_physpages();
-#endif
+	if (xen_released_pages >= get_num_physpages()) {
+		WARN(1, "Released pages underflow current target");
+		return -ERANGE;
+	}
+
+	balloon_stats.current_pages = get_num_physpages() - xen_released_pages;
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
@@ -699,28 +745,23 @@ static int __init balloon_init(void)
 	balloon_stats.schedule_delay = 1;
 	balloon_stats.max_schedule_delay = 32;
 	balloon_stats.retry_count = 1;
-	balloon_stats.max_retry_count = RETRY_UNLIMITED;
+	balloon_stats.max_retry_count = 4;
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 	set_online_page_callback(&xen_online_page);
 	register_memory_notifier(&xen_memory_nb);
-	register_sysctl_table(xen_root);
+	register_sysctl_init("xen/balloon", balloon_table);
 #endif
 
-#ifdef CONFIG_XEN_PV
-	{
-		int i;
+	rc = balloon_add_regions();
+	if (rc)
+		return rc;
 
-		/*
-		 * Initialize the balloon with pages from the extra memory
-		 * regions (see arch/x86/xen/setup.c).
-		 */
-		for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
-			if (xen_extra_mem[i].n_pfns)
-				balloon_add_region(xen_extra_mem[i].start_pfn,
-						   xen_extra_mem[i].n_pfns);
+	task = kthread_run(balloon_thread, NULL, "xen-balloon");
+	if (IS_ERR(task)) {
+		pr_err("xen-balloon thread could not be started, ballooning will not work!\n");
+		return PTR_ERR(task);
 	}
-#endif
 
 	/* Init the xen-balloon driver. */
 	xen_balloon_init();
@@ -728,3 +769,38 @@
 	return 0;
 }
 subsys_initcall(balloon_init);
+
+static int __init balloon_wait_finish(void)
+{
+	long credit, last_credit = 0;
+	unsigned long last_changed = 0;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* PV guests don't need to wait. */
+	if (xen_pv_domain() || !current_credit())
+		return 0;
+
+	pr_notice("Waiting for initial ballooning down having finished.\n");
+
+	while ((credit = current_credit()) < 0) {
+		if (credit != last_credit) {
+			last_changed = jiffies;
+			last_credit = credit;
+		}
+		if (balloon_state == BP_ECANCELED) {
+			pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n",
+				     -credit);
+			if (time_is_before_eq_jiffies(last_changed + HZ * balloon_boot_timeout))
+				panic("Initial ballooning failed!\n");
+		}
+
+		schedule_timeout_interruptible(HZ / 10);
+	}
+
+	pr_notice("Initial ballooning down finished.\n");
+
+	return 0;
+}
+late_initcall_sync(balloon_wait_finish);
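
Editorial note (not part of the commit): among other changes, the diff above renames alloc_xenballooned_pages()/free_xenballooned_pages() to xen_alloc_ballooned_pages()/xen_free_ballooned_pages() and widens the page counts to unsigned int. A minimal caller sketch follows; the function example_use_ballooned_pages() and its error handling are illustrative assumptions, not code from this commit.

/*
 * Illustrative only: allocate two ballooned-out pages, use them, and
 * hand them back to the balloon driver. Assumes a Xen-enabled kernel
 * providing <xen/balloon.h>.
 */
#include <linux/err.h>
#include <linux/mm_types.h>
#include <xen/balloon.h>

static int example_use_ballooned_pages(void)
{
	struct page *pages[2];
	int ret;

	/* Renamed in this diff from alloc_xenballooned_pages(). */
	ret = xen_alloc_ballooned_pages(2, pages);
	if (ret)
		return ret;	/* -ENOMEM when no pages can be ballooned out */

	/* ... map grant references or otherwise use the frames here ... */

	/* Renamed in this diff from free_xenballooned_pages(). */
	xen_free_ballooned_pages(2, pages);
	return 0;
}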
