diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries/lpar.c')
| -rw-r--r-- | arch/powerpc/platforms/pseries/lpar.c | 1595 |
1 files changed, 1438 insertions, 157 deletions
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 02d6e21619bb..6a415febc53b 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1,48 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * pSeries_lpar.c * Copyright (C) 2001 Todd Inglett, IBM Corporation * * pSeries LPAR support. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Enables debugging of low-level hash table routines - careful! */ #undef DEBUG +#define pr_fmt(fmt) "lpar: " fmt #include <linux/kernel.h> #include <linux/dma-mapping.h> #include <linux/console.h> #include <linux/export.h> +#include <linux/jump_label.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/stop_machine.h> +#include <linux/spinlock.h> +#include <linux/cpuhotplug.h> +#include <linux/workqueue.h> +#include <linux/proc_fs.h> +#include <linux/pgtable.h> +#include <linux/debugfs.h> + #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/machdep.h> +#include <asm/setup.h> #include <asm/mmu_context.h> #include <asm/iommu.h> -#include <asm/tlbflush.h> #include <asm/tlb.h> -#include <asm/prom.h> #include <asm/cputable.h> +#include <asm/papr-sysparm.h> #include <asm/udbg.h> #include <asm/smp.h> #include <asm/trace.h> #include <asm/firmware.h> +#include <asm/plpar_wrappers.h> +#include <asm/kexec.h> +#include <asm/fadump.h> +#include <asm/dtl.h> +#include <asm/vphn.h> -#include "plpar_wrappers.h" #include "pseries.h" /* Flag bits for H_BULK_REMOVE */ @@ -58,15 +59,632 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); -extern void pSeries_find_serial_port(void); +#ifdef CONFIG_PPC_64S_HASH_MMU +/* + * H_BLOCK_REMOVE supported block size for this page size in segment who's base + * page size is that page size. + * + * The first index is the segment base page size, the second one is the actual + * page size. + */ +static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; +#endif + +/* + * Due to the involved complexity, and that the current hypervisor is only + * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE + * buffer size to 8 size block. + */ +#define HBLKRM_SUPPORTED_BLOCK_SIZE 8 + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +static u8 dtl_mask = DTL_LOG_PREEMPT; +#else +static u8 dtl_mask; +#endif + +void alloc_dtl_buffers(unsigned long *time_limit) +{ + int cpu; + struct paca_struct *pp; + struct dtl_entry *dtl; + + for_each_possible_cpu(cpu) { + pp = paca_ptrs[cpu]; + if (pp->dispatch_log) + continue; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warn("Failed to allocate dispatch trace log for cpu %d\n", + cpu); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + pr_warn("Stolen time statistics will be unreliable\n"); +#endif + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + + if (time_limit && time_after(jiffies, *time_limit)) { + cond_resched(); + *time_limit = jiffies + HZ; + } + } +} + +void register_dtl_buffer(int cpu) +{ + long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + int hwcpu = get_hard_smp_processor_id(cpu); + + pp = paca_ptrs[cpu]; + dtl = pp->dispatch_log; + if (dtl && dtl_mask) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca_of(cpu).dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n", + cpu, hwcpu, ret); + + lppaca_of(cpu).dtl_enable_mask = dtl_mask; + } +} + +#ifdef CONFIG_PPC_SPLPAR +struct dtl_worker { + struct delayed_work work; + int cpu; +}; + +struct vcpu_dispatch_data { + int last_disp_cpu; + + int total_disp; + + int same_cpu_disp; + int same_chip_disp; + int diff_chip_disp; + int far_chip_disp; + + int numa_home_disp; + int numa_remote_disp; + int numa_far_disp; +}; + +/* + * This represents the number of cpus in the hypervisor. Since there is no + * architected way to discover the number of processors in the host, we + * provision for dealing with NR_CPUS. This is currently 2048 by default, and + * is sufficient for our purposes. This will need to be tweaked if + * CONFIG_NR_CPUS is changed. + */ +#define NR_CPUS_H NR_CPUS + +DECLARE_RWSEM(dtl_access_lock); +static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data); +static DEFINE_PER_CPU(u64, dtl_entry_ridx); +static DEFINE_PER_CPU(struct dtl_worker, dtl_workers); +static enum cpuhp_state dtl_worker_state; +static DEFINE_MUTEX(dtl_enable_mutex); +static int vcpudispatch_stats_on __read_mostly; +static int vcpudispatch_stats_freq = 50; +static __be32 *vcpu_associativity, *pcpu_associativity; + + +static void free_dtl_buffers(unsigned long *time_limit) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + int cpu; + struct paca_struct *pp; + + for_each_possible_cpu(cpu) { + pp = paca_ptrs[cpu]; + if (!pp->dispatch_log) + continue; + kmem_cache_free(dtl_cache, pp->dispatch_log); + pp->dtl_ridx = 0; + pp->dispatch_log = NULL; + pp->dispatch_log_end = NULL; + pp->dtl_curr = NULL; + + if (time_limit && time_after(jiffies, *time_limit)) { + cond_resched(); + *time_limit = jiffies + HZ; + } + } +#endif +} + +static int init_cpu_associativity(void) +{ + vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core, + VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL); + pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core, + VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL); + + if (!vcpu_associativity || !pcpu_associativity) { + pr_err("error allocating memory for associativity information\n"); + return -ENOMEM; + } + + return 0; +} + +static void destroy_cpu_associativity(void) +{ + kfree(vcpu_associativity); + kfree(pcpu_associativity); + vcpu_associativity = pcpu_associativity = NULL; +} + +static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag) +{ + __be32 *assoc; + int rc = 0; + + assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE]; + if (!assoc[0]) { + rc = hcall_vphn(cpu, flag, &assoc[0]); + if (rc) + return NULL; + } + + return assoc; +} + +static __be32 *get_pcpu_associativity(int cpu) +{ + return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU); +} + +static __be32 *get_vcpu_associativity(int cpu) +{ + return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU); +} + +static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) +{ + __be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc; + + if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H) + return -EINVAL; + + last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu); + cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu); + + if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) + return -EIO; + + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); +} + +static int cpu_home_node_dispatch_distance(int disp_cpu) +{ + __be32 *disp_cpu_assoc, *vcpu_assoc; + int vcpu_id = smp_processor_id(); + + if (disp_cpu >= NR_CPUS_H) { + pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n", + disp_cpu, NR_CPUS_H); + return -EINVAL; + } + + disp_cpu_assoc = get_pcpu_associativity(disp_cpu); + vcpu_assoc = get_vcpu_associativity(vcpu_id); + + if (!disp_cpu_assoc || !vcpu_assoc) + return -EIO; + + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); +} + +static void update_vcpu_disp_stat(int disp_cpu) +{ + struct vcpu_dispatch_data *disp; + int distance; + + disp = this_cpu_ptr(&vcpu_disp_data); + if (disp->last_disp_cpu == -1) { + disp->last_disp_cpu = disp_cpu; + return; + } + + disp->total_disp++; + + if (disp->last_disp_cpu == disp_cpu || + (cpu_first_thread_sibling(disp->last_disp_cpu) == + cpu_first_thread_sibling(disp_cpu))) + disp->same_cpu_disp++; + else { + distance = cpu_relative_dispatch_distance(disp->last_disp_cpu, + disp_cpu); + if (distance < 0) + pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n", + smp_processor_id()); + else { + switch (distance) { + case 0: + disp->same_chip_disp++; + break; + case 1: + disp->diff_chip_disp++; + break; + case 2: + disp->far_chip_disp++; + break; + default: + pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n", + smp_processor_id(), + disp->last_disp_cpu, + disp_cpu, + distance); + } + } + } + + distance = cpu_home_node_dispatch_distance(disp_cpu); + if (distance < 0) + pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n", + smp_processor_id()); + else { + switch (distance) { + case 0: + disp->numa_home_disp++; + break; + case 1: + disp->numa_remote_disp++; + break; + case 2: + disp->numa_far_disp++; + break; + default: + pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n", + smp_processor_id(), + disp_cpu, + distance); + } + } + + disp->last_disp_cpu = disp_cpu; +} + +static void process_dtl_buffer(struct work_struct *work) +{ + struct dtl_entry dtle; + u64 i = __this_cpu_read(dtl_entry_ridx); + struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + struct dtl_worker *d = container_of(work, struct dtl_worker, work.work); + + if (!local_paca->dispatch_log) + return; + + /* if we have been migrated away, we cancel ourself */ + if (d->cpu != smp_processor_id()) { + pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n", + smp_processor_id()); + return; + } + + if (i == be64_to_cpu(vpa->dtl_idx)) + goto out; + + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtle = *dtl; + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n", + d->cpu, + be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i); + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id)); + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + + __this_cpu_write(dtl_entry_ridx, i); + +out: + schedule_delayed_work_on(d->cpu, to_delayed_work(work), + HZ / vcpudispatch_stats_freq); +} + +static int dtl_worker_online(unsigned int cpu) +{ + struct dtl_worker *d = &per_cpu(dtl_workers, cpu); + + memset(d, 0, sizeof(*d)); + INIT_DELAYED_WORK(&d->work, process_dtl_buffer); + d->cpu = cpu; + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + per_cpu(dtl_entry_ridx, cpu) = 0; + register_dtl_buffer(cpu); +#else + per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx); +#endif + + schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq); + return 0; +} + +static int dtl_worker_offline(unsigned int cpu) +{ + struct dtl_worker *d = &per_cpu(dtl_workers, cpu); + + cancel_delayed_work_sync(&d->work); + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + unregister_dtl(get_hard_smp_processor_id(cpu)); +#endif + + return 0; +} + +static void set_global_dtl_mask(u8 mask) +{ + int cpu; + + dtl_mask = mask; + for_each_present_cpu(cpu) + lppaca_of(cpu).dtl_enable_mask = dtl_mask; +} + +static void reset_global_dtl_mask(void) +{ + int cpu; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + dtl_mask = DTL_LOG_PREEMPT; +#else + dtl_mask = 0; +#endif + for_each_present_cpu(cpu) + lppaca_of(cpu).dtl_enable_mask = dtl_mask; +} + +static int dtl_worker_enable(unsigned long *time_limit) +{ + int rc = 0, state; + + if (!down_write_trylock(&dtl_access_lock)) { + rc = -EBUSY; + goto out; + } + + set_global_dtl_mask(DTL_LOG_ALL); + + /* Setup dtl buffers and register those */ + alloc_dtl_buffers(time_limit); + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online", + dtl_worker_online, dtl_worker_offline); + if (state < 0) { + pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n"); + free_dtl_buffers(time_limit); + reset_global_dtl_mask(); + up_write(&dtl_access_lock); + rc = -EINVAL; + goto out; + } + dtl_worker_state = state; + +out: + return rc; +} + +static void dtl_worker_disable(unsigned long *time_limit) +{ + cpuhp_remove_state(dtl_worker_state); + free_dtl_buffers(time_limit); + reset_global_dtl_mask(); + up_write(&dtl_access_lock); +} + +static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, + size_t count, loff_t *ppos) +{ + unsigned long time_limit = jiffies + HZ; + struct vcpu_dispatch_data *disp; + int rc, cmd, cpu; + char buf[16]; + + if (count > 15) + return -EINVAL; + + if (copy_from_user(buf, p, count)) + return -EFAULT; + + buf[count] = 0; + rc = kstrtoint(buf, 0, &cmd); + if (rc || cmd < 0 || cmd > 1) { + pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n"); + return rc ? rc : -EINVAL; + } + + mutex_lock(&dtl_enable_mutex); + + if ((cmd == 0 && !vcpudispatch_stats_on) || + (cmd == 1 && vcpudispatch_stats_on)) + goto out; + + if (cmd) { + rc = init_cpu_associativity(); + if (rc) { + destroy_cpu_associativity(); + goto out; + } + + for_each_possible_cpu(cpu) { + disp = per_cpu_ptr(&vcpu_disp_data, cpu); + memset(disp, 0, sizeof(*disp)); + disp->last_disp_cpu = -1; + } + + rc = dtl_worker_enable(&time_limit); + if (rc) { + destroy_cpu_associativity(); + goto out; + } + } else { + dtl_worker_disable(&time_limit); + destroy_cpu_associativity(); + } + + vcpudispatch_stats_on = cmd; + +out: + mutex_unlock(&dtl_enable_mutex); + if (rc) + return rc; + return count; +} + +static int vcpudispatch_stats_display(struct seq_file *p, void *v) +{ + int cpu; + struct vcpu_dispatch_data *disp; + + if (!vcpudispatch_stats_on) { + seq_puts(p, "off\n"); + return 0; + } + + for_each_online_cpu(cpu) { + disp = per_cpu_ptr(&vcpu_disp_data, cpu); + seq_printf(p, "cpu%d", cpu); + seq_put_decimal_ull(p, " ", disp->total_disp); + seq_put_decimal_ull(p, " ", disp->same_cpu_disp); + seq_put_decimal_ull(p, " ", disp->same_chip_disp); + seq_put_decimal_ull(p, " ", disp->diff_chip_disp); + seq_put_decimal_ull(p, " ", disp->far_chip_disp); + seq_put_decimal_ull(p, " ", disp->numa_home_disp); + seq_put_decimal_ull(p, " ", disp->numa_remote_disp); + seq_put_decimal_ull(p, " ", disp->numa_far_disp); + seq_puts(p, "\n"); + } + + return 0; +} + +static int vcpudispatch_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, vcpudispatch_stats_display, NULL); +} + +static const struct proc_ops vcpudispatch_stats_proc_ops = { + .proc_open = vcpudispatch_stats_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static ssize_t vcpudispatch_stats_freq_write(struct file *file, + const char __user *p, size_t count, loff_t *ppos) +{ + int rc, freq; + char buf[16]; + + if (count > 15) + return -EINVAL; + + if (copy_from_user(buf, p, count)) + return -EFAULT; + + buf[count] = 0; + rc = kstrtoint(buf, 0, &freq); + if (rc || freq < 1 || freq > HZ) { + pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n", + HZ); + return rc ? rc : -EINVAL; + } + + vcpudispatch_stats_freq = freq; + + return count; +} + +static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v) +{ + seq_printf(p, "%d\n", vcpudispatch_stats_freq); + return 0; +} + +static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file) +{ + return single_open(file, vcpudispatch_stats_freq_display, NULL); +} + +static const struct proc_ops vcpudispatch_stats_freq_proc_ops = { + .proc_open = vcpudispatch_stats_freq_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_freq_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int __init vcpudispatch_stats_procfs_init(void) +{ + if (!lppaca_shared_proc()) + return 0; + + if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL, + &vcpudispatch_stats_proc_ops)) + pr_err("vcpudispatch_stats: error creating procfs file\n"); + else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL, + &vcpudispatch_stats_freq_proc_ops)) + pr_err("vcpudispatch_stats_freq: error creating procfs file\n"); + + return 0; +} + +machine_device_initcall(pseries, vcpudispatch_stats_procfs_init); + +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +u64 pseries_paravirt_steal_clock(int cpu) +{ + struct lppaca *lppaca = &lppaca_of(cpu); + + /* + * VPA steal time counters are reported at TB frequency. Hence do a + * conversion to ns before returning + */ + return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) + + be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb))); +} +#endif + +#endif /* CONFIG_PPC_SPLPAR */ void vpa_init(int cpu) { int hwcpu = get_hard_smp_processor_id(cpu); unsigned long addr; long ret; - struct paca_struct *pp; - struct dtl_entry *dtl; + + /* + * The spec says it "may be problematic" if CPU x registers the VPA of + * CPU y. We should never do that, but wail if we ever do. + */ + WARN_ON(cpu != smp_processor_id()); if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca_of(cpu).vmxregs_in_use = 1; @@ -82,40 +700,60 @@ void vpa_init(int cpu) "%lx failed with %ld\n", cpu, hwcpu, addr, ret); return; } + +#ifdef CONFIG_PPC_64S_HASH_MMU /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. */ - addr = __pa(&slb_shadow[cpu]); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) { + addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr); ret = register_slb_shadow(hwcpu, addr); if (ret) pr_err("WARNING: SLB shadow buffer registration for " "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } +#endif /* CONFIG_PPC_64S_HASH_MMU */ /* * Register dispatch trace log, if one has been allocated. */ - pp = &paca[cpu]; - dtl = pp->dispatch_log; - if (dtl) { - pp->dtl_ridx = 0; - pp->dtl_curr = dtl; - lppaca_of(cpu).dtl_idx = 0; + register_dtl_buffer(cpu); +} - /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; - ret = register_dtl(hwcpu, __pa(dtl)); - if (ret) - pr_err("WARNING: DTL registration of cpu %d (hw %d) " - "failed with %ld\n", smp_processor_id(), - hwcpu, ret); - lppaca_of(cpu).dtl_enable_mask = 2; +#ifdef CONFIG_PPC_BOOK3S_64 + +static int __init pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = 0; + + if (table_size) + flags |= PROC_TABLE_NEW; + if (radix_enabled()) { + flags |= PROC_TABLE_RADIX; + if (mmu_has_feature(MMU_FTR_GTSE)) + flags |= PROC_TABLE_GTSE; + } else + flags |= PROC_TABLE_HPT_SLB; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); } + return rc; } +#ifdef CONFIG_PPC_64S_HASH_MMU + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -145,16 +783,12 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Exact = 0 */ flags = 0; - /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~_PAGE_COHERENT; if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); if (unlikely(lpar_rc == H_PTEG_FULL)) { - if (!(vflags & HPTE_V_BOLTED)) - pr_devel(" full\n"); + pr_devel("Hash table group is full\n"); return -1; } @@ -164,8 +798,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, * or we will loop forever, so return -2 in this case. */ if (unlikely(lpar_rc != H_SUCCESS)) { - if (!(vflags & HPTE_V_BOLTED)) - pr_devel(" lpar err %ld\n", lpar_rc); + pr_err("Failed hash pte insert with error %ld\n", lpar_rc); return -2; } if (!(vflags & HPTE_V_BOLTED)) @@ -193,7 +826,7 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) /* don't remove a bolted entry */ lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset, - (0x1UL << 4), &dummy1, &dummy2); + HPTE_V_BOLTED, &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; @@ -211,7 +844,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) return -1; } -static void pSeries_lpar_hptab_clear(void) +/* Called during kexec sequence with MMU off */ +static notrace void manual_hpte_clear_all(void) { unsigned long size_bytes = 1UL << ppc64_pft_size; unsigned long hpte_count = size_bytes >> 4; @@ -228,8 +862,11 @@ static void pSeries_lpar_hptab_clear(void) */ for (i = 0; i < hpte_count; i += 4) { lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes); - if (lpar_rc != H_SUCCESS) + if (lpar_rc != H_SUCCESS) { + pr_info("Failed to read hash page table at %ld err %ld\n", + i, lpar_rc); continue; + } for (j = 0; j < 4; j++){ if ((ptes[j].pteh & HPTE_V_VRMA_MASK) == HPTE_V_VRMA_MASK) @@ -241,6 +878,43 @@ static void pSeries_lpar_hptab_clear(void) } } +/* Called during kexec sequence with MMU off */ +static notrace int hcall_hpte_clear_all(void) +{ + int rc; + + do { + rc = plpar_hcall_norets(H_CLEAR_HPT); + } while (rc == H_CONTINUE); + + return rc; +} + +/* Called during kexec sequence with MMU off */ +static notrace void pseries_hpte_clear_all(void) +{ + int rc; + + rc = hcall_hpte_clear_all(); + if (rc != H_SUCCESS) + manual_hpte_clear_all(); + +#ifdef __LITTLE_ENDIAN__ + /* + * Reset exceptions to big endian. + * + * FIXME this is a hack for kexec, we need to reset the exception + * endian before starting the new kernel and this is a convenient place + * to do it. + * + * This is also called on boot when a fadump happens. In that case we + * must not change the exception endian mode. + */ + if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) + pseries_big_endian_exceptions(); +#endif +} + /* * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and * the low 3 bits of flags happen to line up. So no transform is needed. @@ -251,14 +925,20 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, unsigned long vpn, int psize, int apsize, - int ssize, int local) + int ssize, unsigned long inv_flags) { unsigned long lpar_rc; - unsigned long flags = (newpp & 7) | H_AVPN; + unsigned long flags; unsigned long want_v; want_v = hpte_encode_avpn(vpn, psize, ssize); + flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN; + flags |= (newpp & HPTE_R_KEY_HI) >> 48; + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); @@ -276,48 +956,59 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, return 0; } -static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) +static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group) { - unsigned long dword0; - unsigned long lpar_rc; - unsigned long dummy_word1; - unsigned long flags; + long lpar_rc; + unsigned long i, j; + struct { + unsigned long pteh; + unsigned long ptel; + } ptes[4]; - /* Read 1 pte at a time */ - /* Do not need RPN to logical page translation */ - /* No cross CEC PFT access */ - flags = 0; + for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { - lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1); + lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); + if (lpar_rc != H_SUCCESS) { + pr_info("Failed to read hash page table at %ld err %ld\n", + hpte_group, lpar_rc); + continue; + } - BUG_ON(lpar_rc != H_SUCCESS); + for (j = 0; j < 4; j++) { + if (HPTE_V_COMPARE(ptes[j].pteh, want_v) && + (ptes[j].pteh & HPTE_V_VALID)) + return i + j; + } + } - return dword0; + return -1; } static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { - unsigned long hash; - unsigned long i; long slot; - unsigned long want_v, hpte_v; + unsigned long hash; + unsigned long want_v; + unsigned long hpte_group; hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); - /* Bolted entries are always in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - hpte_v = pSeries_lpar_hpte_getword0(slot); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* HPTE matches */ - return slot; - ++slot; + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. + */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; } - - return -1; -} + return hpte_group + slot; +} static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, @@ -332,7 +1023,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - flags = newpp & 7; + flags = newpp & (HPTE_R_PP | HPTE_R_N); + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + /* Move pp0 into bit 8 (IBM 55) */ + flags |= (newpp & HPTE_R_PP0) >> 55; + + flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO); + lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_SUCCESS); @@ -357,23 +1054,148 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, BUG_ON(lpar_rc != H_SUCCESS); } + +/* + * As defined in the PAPR's section 14.5.4.1.8 + * The control mask doesn't include the returned reference and change bit from + * the processed PTE. + */ +#define HBLKR_AVPN 0x0100000000000000UL +#define HBLKR_CTRL_MASK 0xf800000000000000UL +#define HBLKR_CTRL_SUCCESS 0x8000000000000000UL +#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL +#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL + +/* + * Returned true if we are supporting this block size for the specified segment + * base page size and actual page size. + * + * Currently, we only support 8 size block. + */ +static inline bool is_supported_hlbkrm(int bpsize, int psize) +{ + return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE); +} + +/** + * H_BLOCK_REMOVE caller. + * @idx should point to the latest @param entry set with a PTEX. + * If PTE cannot be processed because another CPUs has already locked that + * group, those entries are put back in @param starting at index 1. + * If entries has to be retried and @retry_busy is set to true, these entries + * are retried until success. If @retry_busy is set to false, the returned + * is the number of entries yet to process. + */ +static unsigned long call_block_remove(unsigned long idx, unsigned long *param, + bool retry_busy) +{ + unsigned long i, rc, new_idx; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + if (idx < 2) { + pr_warn("Unexpected empty call to H_BLOCK_REMOVE"); + return 0; + } +again: + new_idx = 0; + if (idx > PLPAR_HCALL9_BUFSIZE) { + pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx); + idx = PLPAR_HCALL9_BUFSIZE; + } else if (idx < PLPAR_HCALL9_BUFSIZE) + param[idx] = HBR_END; + + rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf, + param[0], /* AVA */ + param[1], param[2], param[3], param[4], /* TS0-7 */ + param[5], param[6], param[7], param[8]); + if (rc == H_SUCCESS) + return 0; + + BUG_ON(rc != H_PARTIAL); + + /* Check that the unprocessed entries were 'not found' or 'busy' */ + for (i = 0; i < idx-1; i++) { + unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK; + + if (ctrl == HBLKR_CTRL_ERRBUSY) { + param[++new_idx] = param[i+1]; + continue; + } + + BUG_ON(ctrl != HBLKR_CTRL_SUCCESS + && ctrl != HBLKR_CTRL_ERRNOTFOUND); + } + + /* + * If there were entries found busy, retry these entries if requested, + * of if all the entries have to be retried. + */ + if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) { + idx = new_idx + 1; + goto again; + } + + return new_idx; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need * to make sure that we avoid bouncing the hypervisor tlbie lock. */ #define PPC64_HUGE_HPTE_BATCH 12 -static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, - unsigned long *vpn, int count, - int psize, int ssize) +static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn, + int count, int psize, int ssize) { - unsigned long param[8]; - int i = 0, pix = 0, rc; - unsigned long flags = 0; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long param[PLPAR_HCALL9_BUFSIZE]; + unsigned long shift, current_vpgb, vpgb; + int i, pix = 0; - if (lock_tlbie) - spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + shift = mmu_psize_defs[psize].shift; + + for (i = 0; i < count; i++) { + /* + * Shifting 3 bits more on the right to get a + * 8 pages aligned virtual addresse. + */ + vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3)); + if (!pix || vpgb != current_vpgb) { + /* + * Need to start a new 8 pages block, flush + * the current one if needed. + */ + if (pix) + (void)call_block_remove(pix, param, true); + current_vpgb = vpgb; + param[0] = hpte_encode_avpn(vpn[i], psize, ssize); + pix = 1; + } + + param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i]; + if (pix == PLPAR_HCALL9_BUFSIZE) { + pix = call_block_remove(pix, param, false); + /* + * pix = 0 means that all the entries were + * removed, we can start a new block. + * Otherwise, this means that there are entries + * to retry, and pix points to latest one, so + * we should increment it and try to continue + * the same block. + */ + if (pix) + pix++; + } + } + if (pix) + (void)call_block_remove(pix, param, true); +} + +static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn, + int count, int psize, int ssize) +{ + unsigned long param[PLPAR_HCALL9_BUFSIZE]; + int i = 0, pix = 0, rc; for (i = 0; i < count; i++) { @@ -401,21 +1223,40 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, param[6], param[7]); BUG_ON(rc != H_SUCCESS); } +} + +static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, + unsigned long *vpn, + int count, int psize, + int ssize) +{ + unsigned long flags = 0; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + /* Assuming THP size is 16M */ + if (is_supported_hlbkrm(psize, MMU_PAGE_16M)) + hugepage_block_invalidate(slot, vpn, count, psize, ssize); + else + hugepage_bulk_invalidate(slot, vpn, count, psize, ssize); if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); } -static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm, - unsigned char *hpte_slot_array, - unsigned long addr, int psize) +static void pSeries_lpar_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) { - int ssize = 0, i, index = 0; + int i, index = 0; unsigned long s_addr = addr; unsigned int max_hpte_count, valid; unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH]; unsigned long slot_array[PPC64_HUGE_HPTE_BATCH]; - unsigned long shift, hidx, vpn = 0, vsid, hash, slot; + unsigned long shift, hidx, vpn = 0, hash, slot; shift = mmu_psize_defs[psize].shift; max_hpte_count = 1U << (PMD_SHIFT - shift); @@ -428,15 +1269,6 @@ static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm, /* get the vpn */ addr = s_addr + (i * (1ul << shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx & _PTEIDX_SECONDARY) @@ -463,9 +1295,18 @@ static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm, __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array, index, psize, ssize); } +#else +static void pSeries_lpar_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + WARN(1, "%s called without THP support\n", __func__); +} +#endif -static void pSeries_lpar_hpte_removebolted(unsigned long ea, - int psize, int ssize) +static int pSeries_lpar_hpte_removebolted(unsigned long ea, + int psize, int ssize) { unsigned long vpn; unsigned long slot, vsid; @@ -474,11 +1315,208 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea, vpn = hpt_vpn(ea, vsid, ssize); slot = pSeries_lpar_hpte_find(vpn, psize, ssize); - BUG_ON(slot == -1); + if (slot == -1) + return -ENOENT; + /* * lpar doesn't use the passed actual page size */ pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0); + return 0; +} + + +static inline unsigned long compute_slot(real_pte_t pte, + unsigned long vpn, + unsigned long index, + unsigned long shift, + int ssize) +{ + unsigned long slot, hash, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + return slot; +} + +/** + * The hcall H_BLOCK_REMOVE implies that the virtual pages to processed are + * "all within the same naturally aligned 8 page virtual address block". + */ +static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch, + unsigned long *param) +{ + unsigned long vpn; + unsigned long i, pix = 0; + unsigned long index, shift, slot, current_vpgb, vpgb; + real_pte_t pte; + int psize, ssize; + + psize = batch->psize; + ssize = batch->ssize; + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + /* + * Shifting 3 bits more on the right to get a + * 8 pages aligned virtual addresse. + */ + vpgb = (vpn >> (shift - VPN_SHIFT + 3)); + if (!pix || vpgb != current_vpgb) { + /* + * Need to start a new 8 pages block, flush + * the current one if needed. + */ + if (pix) + (void)call_block_remove(pix, param, + true); + current_vpgb = vpgb; + param[0] = hpte_encode_avpn(vpn, psize, + ssize); + pix = 1; + } + + slot = compute_slot(pte, vpn, index, shift, ssize); + param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot; + + if (pix == PLPAR_HCALL9_BUFSIZE) { + pix = call_block_remove(pix, param, false); + /* + * pix = 0 means that all the entries were + * removed, we can start a new block. + * Otherwise, this means that there are entries + * to retry, and pix points to latest one, so + * we should increment it and try to continue + * the same block. + */ + if (pix) + pix++; + } + } pte_iterate_hashed_end(); + } + + if (pix) + (void)call_block_remove(pix, param, true); +} + +/* + * TLB Block Invalidate Characteristics + * + * These characteristics define the size of the block the hcall H_BLOCK_REMOVE + * is able to process for each couple segment base page size, actual page size. + * + * The ibm,get-system-parameter properties is returning a buffer with the + * following layout: + * + * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ] + * ----------------- + * TLB Block Invalidate Specifiers: + * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ] + * [ 1 byte Number of page sizes (N) that are supported for the specified + * TLB invalidate block size ] + * [ 1 byte Encoded segment base page size and actual page size + * MSB=0 means 4k segment base page size and actual page size + * MSB=1 the penc value in mmu_psize_def ] + * ... + * ----------------- + * Next TLB Block Invalidate Specifiers... + * ----------------- + * [ 0 ] + */ +static inline void set_hblkrm_bloc_size(int bpsize, int psize, + unsigned int block_size) +{ + if (block_size > hblkrm_size[bpsize][psize]) + hblkrm_size[bpsize][psize] = block_size; +} + +/* + * Decode the Encoded segment base page size and actual page size. + * PAPR specifies: + * - bit 7 is the L bit + * - bits 0-5 are the penc value + * If the L bit is 0, this means 4K segment base page size and actual page size + * otherwise the penc value should be read. + */ +#define HBLKRM_L_MASK 0x80 +#define HBLKRM_PENC_MASK 0x3f +static inline void __init check_lp_set_hblkrm(unsigned int lp, + unsigned int block_size) +{ + unsigned int bpsize, psize; + + /* First, check the L bit, if not set, this means 4K */ + if ((lp & HBLKRM_L_MASK) == 0) { + set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size); + return; + } + + lp &= HBLKRM_PENC_MASK; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) { + struct mmu_psize_def *def = &mmu_psize_defs[bpsize]; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (def->penc[psize] == lp) { + set_hblkrm_bloc_size(bpsize, psize, block_size); + return; + } + } + } +} + +/* + * The size of the TLB Block Invalidate Characteristics is variable. But at the + * maximum it will be the number of possible page sizes *2 + 10 bytes. + * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size + * (128 bytes) for the buffer to get plenty of space. + */ +#define SPLPAR_TLB_BIC_MAXLENGTH 128 + +void __init pseries_lpar_read_hblkrm_characteristics(void) +{ + static struct papr_sysparm_buf buf __initdata; + int len, idx, bpsize; + + if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + return; + + if (papr_sysparm_get(PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS, &buf)) + return; + + len = be16_to_cpu(buf.len); + if (len > SPLPAR_TLB_BIC_MAXLENGTH) { + pr_warn("%s too large returned buffer %d", __func__, len); + return; + } + + idx = 0; + while (idx < len) { + u8 block_shift = buf.val[idx++]; + u32 block_size; + unsigned int npsize; + + if (!block_shift) + break; + + block_size = 1 << block_shift; + + for (npsize = buf.val[idx++]; + npsize > 0 && idx < len; npsize--) + check_lp_set_hblkrm((unsigned int)buf.val[idx++], + block_size); + } + + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (idx = 0; idx < MMU_PAGE_COUNT; idx++) + if (hblkrm_size[bpsize][idx]) + pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d", + bpsize, idx, hblkrm_size[bpsize][idx]); } /* @@ -490,16 +1528,21 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) unsigned long vpn; unsigned long i, pix, rc; unsigned long flags = 0; - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - unsigned long param[9]; - unsigned long hash, index, shift, hidx, slot; + unsigned long param[PLPAR_HCALL9_BUFSIZE]; + unsigned long index, shift, slot; real_pte_t pte; int psize, ssize; if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + if (is_supported_hlbkrm(batch->psize, batch->psize)) { + do_block_remove(number, batch, param); + goto out; + } + psize = batch->psize; ssize = batch->ssize; pix = 0; @@ -507,12 +1550,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) vpn = batch->vpn[i]; pte = batch->pte[i]; pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; + slot = compute_slot(pte, vpn, index, shift, ssize); if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { /* * lpar doesn't use the passed actual page size @@ -543,6 +1581,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) BUG_ON(rc != H_SUCCESS); } +out: if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); } @@ -551,26 +1590,151 @@ static int __init disable_bulk_remove(char *str) { if (strcmp(str, "off") == 0 && firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { - printk(KERN_INFO "Disabling BULK_REMOVE firmware feature"); - powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE; + pr_info("Disabling BULK_REMOVE firmware feature"); + powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE; } return 1; } __setup("bulk_remove=", disable_bulk_remove); -void __init hpte_init_lpar(void) +#define HPT_RESIZE_TIMEOUT 10000 /* ms */ + +struct hpt_resize_state { + unsigned long shift; + int commit_rc; +}; + +static int pseries_lpar_resize_hpt_commit(void *data) +{ + struct hpt_resize_state *state = data; + + state->commit_rc = plpar_resize_hpt_commit(0, state->shift); + if (state->commit_rc != H_SUCCESS) + return -EIO; + + /* Hypervisor has transitioned the HTAB, update our globals */ + ppc64_pft_size = state->shift; + htab_size_bytes = 1UL << ppc64_pft_size; + htab_hash_mask = (htab_size_bytes >> 7) - 1; + + return 0; +} + +/* + * Must be called in process context. The caller must hold the + * cpus_lock. + */ +static int pseries_lpar_resize_hpt(unsigned long shift) +{ + struct hpt_resize_state state = { + .shift = shift, + .commit_rc = H_FUNCTION, + }; + unsigned int delay, total_delay = 0; + int rc; + ktime_t t0, t1, t2; + + might_sleep(); + + if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + return -ENODEV; + + pr_info("Attempting to resize HPT to shift %lu\n", shift); + + t0 = ktime_get(); + + rc = plpar_resize_hpt_prepare(0, shift); + while (H_IS_LONG_BUSY(rc)) { + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > HPT_RESIZE_TIMEOUT) { + /* prepare with shift==0 cancels an in-progress resize */ + rc = plpar_resize_hpt_prepare(0, 0); + if (rc != H_SUCCESS) + pr_warn("Unexpected error %d cancelling timed out HPT resize\n", + rc); + return -ETIMEDOUT; + } + msleep(delay); + rc = plpar_resize_hpt_prepare(0, shift); + } + + switch (rc) { + case H_SUCCESS: + /* Continue on */ + break; + + case H_PARAMETER: + pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n"); + return -EINVAL; + case H_RESOURCE: + pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n"); + return -EPERM; + default: + pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc); + return -EIO; + } + + t1 = ktime_get(); + + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); + + t2 = ktime_get(); + + if (rc != 0) { + switch (state.commit_rc) { + case H_PTEG_FULL: + return -ENOSPC; + + default: + pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n", + state.commit_rc); + return -EIO; + }; + } + + pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n", + shift, (long long) ktime_ms_delta(t1, t0), + (long long) ktime_ms_delta(t2, t1)); + + return 0; +} + +void __init hpte_init_pseries(void) +{ + mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = pSeries_lpar_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp; + mmu_hash_ops.hpte_insert = pSeries_lpar_hpte_insert; + mmu_hash_ops.hpte_remove = pSeries_lpar_hpte_remove; + mmu_hash_ops.hpte_removebolted = pSeries_lpar_hpte_removebolted; + mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; + mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; + mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; + + if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pseries_lpar_register_process_table(0, 0, 0); +} +#endif /* CONFIG_PPC_64S_HASH_MMU */ + +#ifdef CONFIG_PPC_RADIX_MMU +void __init radix_init_pseries(void) { - ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate; - ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp; - ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp; - ppc_md.hpte_insert = pSeries_lpar_hpte_insert; - ppc_md.hpte_remove = pSeries_lpar_hpte_remove; - ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; - ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; - ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; - ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; + pr_info("Using radix MMU under hypervisor\n"); + + pseries_lpar_register_process_table(__pa(process_tb), + 0, PRTB_SIZE_SHIFT - 12); } +#endif #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 @@ -582,13 +1746,13 @@ static int __init cmo_free_hint(char *str) parm = strstrip(str); if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) { - printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n"); + pr_info("%s: CMO free page hinting is not active.\n", __func__); cmo_free_hint_flag = 0; return 1; } cmo_free_hint_flag = 1; - printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n"); + pr_info("%s: CMO free page hinting is active.\n", __func__); if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0) return 1; @@ -615,6 +1779,8 @@ static void pSeries_set_page_state(struct page *page, int order, void arch_free_page(struct page *page, int order) { + if (radix_enabled()) + return; if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) return; @@ -622,9 +1788,24 @@ void arch_free_page(struct page *page, int order) } EXPORT_SYMBOL(arch_free_page); -#endif +#endif /* CONFIG_PPC_SMLPAR */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS +#ifdef CONFIG_JUMP_LABEL +struct static_key hcall_tracepoint_key = STATIC_KEY_INIT; + +int hcall_tracepoint_regfunc(void) +{ + static_key_slow_inc(&hcall_tracepoint_key); + return 0; +} + +void hcall_tracepoint_unregfunc(void) +{ + static_key_slow_dec(&hcall_tracepoint_key); +} +#else /* * We optimise our hcall path by placing hcall_tracepoint_refcount * directly in the TOC so we can check if the hcall tracepoints are @@ -634,40 +1815,41 @@ EXPORT_SYMBOL(arch_free_page); /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ extern long hcall_tracepoint_refcount; -/* - * Since the tracing code might execute hcalls we need to guard against - * recursion. One example of this are spinlocks calling H_YIELD on - * shared processor partitions. - */ -static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); - -void hcall_tracepoint_regfunc(void) +int hcall_tracepoint_regfunc(void) { hcall_tracepoint_refcount++; + return 0; } void hcall_tracepoint_unregfunc(void) { hcall_tracepoint_refcount--; } +#endif -void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +/* + * Keep track of hcall tracing depth and prevent recursion. Warn if any is + * detected because it may indicate a problem. This will not catch all + * problems with tracing code making hcalls, because the tracing might have + * been invoked from a non-hcall, so the first hcall could recurse into it + * without warning here, but this better than nothing. + * + * Hcalls with specific problems being traced should use the _notrace + * plpar_hcall variants. + */ +static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); + + +notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args) { unsigned long flags; unsigned int *depth; - /* - * We cannot call tracepoints inside RCU idle regions which - * means we must not trace H_CEDE. - */ - if (opcode == H_CEDE) - return; - local_irq_save(flags); - depth = &__get_cpu_var(hcall_trace_depth); + depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (WARN_ON_ONCE(*depth)) goto out; (*depth)++; @@ -679,20 +1861,16 @@ out: local_irq_restore(flags); } -void __trace_hcall_exit(long opcode, unsigned long retval, - unsigned long *retbuf) +notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf) { unsigned long flags; unsigned int *depth; - if (opcode == H_CEDE) - return; - local_irq_save(flags); - depth = &__get_cpu_var(hcall_trace_depth); + depth = this_cpu_ptr(&hcall_trace_depth); - if (*depth) + if (*depth) /* Don't warn again on the way out */ goto out; (*depth)++; @@ -709,10 +1887,10 @@ out: * h_get_mpp * H_GET_MPP hcall returns info in 7 parms */ -int h_get_mpp(struct hvcall_mpp_data *mpp_data) +long h_get_mpp(struct hvcall_mpp_data *mpp_data) { - int rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; rc = plpar_hcall9(H_GET_MPP, retbuf); @@ -724,7 +1902,7 @@ int h_get_mpp(struct hvcall_mpp_data *mpp_data) mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; - mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL; mpp_data->pool_size = retbuf[4]; mpp_data->loan_request = retbuf[5]; @@ -748,3 +1926,106 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } + +#ifdef CONFIG_PPC_64S_HASH_MMU +static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize) +{ + unsigned long protovsid; + unsigned long va_bits = VA_BITS; + unsigned long modinv, vsid_modulus; + unsigned long max_mod_inv, tmp_modinv; + + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + modinv = VSID_MULINV_256M; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1); + } else { + modinv = VSID_MULINV_1T; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1); + } + + /* + * vsid outside our range. + */ + if (vsid >= vsid_modulus) + return 0; + + /* + * If modinv is the modular multiplicate inverse of (x % vsid_modulus) + * and vsid = (protovsid * x) % vsid_modulus, then we say: + * protovsid = (vsid * modinv) % vsid_modulus + */ + + /* Check if (vsid * modinv) overflow (63 bits) */ + max_mod_inv = 0x7fffffffffffffffull / vsid; + if (modinv < max_mod_inv) + return (vsid * modinv) % vsid_modulus; + + tmp_modinv = modinv/max_mod_inv; + modinv %= max_mod_inv; + + protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus; + protovsid = (protovsid + vsid * modinv) % vsid_modulus; + + return protovsid; +} + +static int __init reserve_vrma_context_id(void) +{ + unsigned long protovsid; + + /* + * Reserve context ids which map to reserved virtual addresses. For now + * we only reserve the context id which maps to the VRMA VSID. We ignore + * the addresses in "ibm,adjunct-virtual-addresses" because we don't + * enable adjunct support via the "ibm,client-architecture-support" + * interface. + */ + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); + hash__reserve_context_id(protovsid >> ESID_BITS_1T); + return 0; +} +machine_device_initcall(pseries, reserve_vrma_context_id); +#endif + +#ifdef CONFIG_DEBUG_FS +/* debugfs file interface for vpa data */ +static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + int cpu = (long)filp->private_data; + struct lppaca *lppaca = &lppaca_of(cpu); + + return simple_read_from_buffer(buf, len, pos, lppaca, + sizeof(struct lppaca)); +} + +static const struct file_operations vpa_fops = { + .open = simple_open, + .read = vpa_file_read, + .llseek = default_llseek, +}; + +static int __init vpa_debugfs_init(void) +{ + char name[16]; + long i; + struct dentry *vpa_dir; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); + + /* set up the per-cpu vpa file*/ + for_each_possible_cpu(i) { + sprintf(name, "cpu-%ld", i); + debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops); + } + + return 0; +} +machine_arch_initcall(pseries, vpa_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ |
