Diffstat (limited to 'arch/x86/kernel/alternative.c')
-rw-r--r--  arch/x86/kernel/alternative.c  3136
1 file changed, 2732 insertions(+), 404 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ebeac487a20c..28518371d8bf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,26 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <linux/kdebug.h>
+#include <linux/execmem.h>
+
#include <asm/text-patching.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/pgtable.h>
-#include <asm/mce.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
#include <asm/nmi.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
int __read_mostly alternatives_patched;
@@ -28,11 +20,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
#define MAX_PATCH_LEN (255-1)
-static int __initdata_or_module debug_alternative;
+#define DA_ALL (~0)
+#define DA_ALT 0x01
+#define DA_RET 0x02
+#define DA_RETPOLINE 0x04
+#define DA_ENDBR 0x08
+#define DA_SMP 0x10
+
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
- debug_alternative = 1;
+ if (str && *str == '=')
+ str++;
+
+ if (!str || kstrtouint(str, 0, &debug_alternative))
+ debug_alternative = DA_ALL;
+
return 1;
}
__setup("debug-alternative", debug_alt);
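/*
 * Annotation (not part of this patch): illustrative use of the new debug
 * bitmask, based only on the DA_* flags and the parser above. Booting with
 * "debug-alternative=0x06" (DA_RET | DA_RETPOLINE) limits the debug output to
 * return-thunk and retpoline patching, while a bare "debug-alternative" (or an
 * unparsable value) falls back to DA_ALL, matching the old all-or-nothing
 * behaviour.
 */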
@@ -46,310 +50,540 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);
-#define DPRINTK(fmt, args...) \
+#define DPRINTK(type, fmt, args...) \
do { \
- if (debug_alternative) \
- printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+ if (debug_alternative & DA_##type) \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
} while (0)
-#define DUMP_BYTES(buf, len, fmt, args...) \
+#define DUMP_BYTES(type, buf, len, fmt, args...) \
do { \
- if (unlikely(debug_alternative)) { \
+ if (unlikely(debug_alternative & DA_##type)) { \
int j; \
\
if (!(len)) \
break; \
\
- printk(KERN_DEBUG fmt, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
} \
} while (0)
-/*
- * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
- * that correspond to that nop. Getting from one nop to the next, we
- * add to the array the offset that is equal to the sum of all sizes of
- * nops preceding the one we are after.
- *
- * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
- * nice symmetry of sizes of the previous nops.
- */
-#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char intelnops[] =
-{
- GENERIC_NOP1,
- GENERIC_NOP2,
- GENERIC_NOP3,
- GENERIC_NOP4,
- GENERIC_NOP5,
- GENERIC_NOP6,
- GENERIC_NOP7,
- GENERIC_NOP8,
- GENERIC_NOP5_ATOMIC
-};
-static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+static const unsigned char x86nops[] =
{
- NULL,
- intelnops,
- intelnops + 1,
- intelnops + 1 + 2,
- intelnops + 1 + 2 + 3,
- intelnops + 1 + 2 + 3 + 4,
- intelnops + 1 + 2 + 3 + 4 + 5,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
+ BYTES_NOP1,
+ BYTES_NOP2,
+ BYTES_NOP3,
+ BYTES_NOP4,
+ BYTES_NOP5,
+ BYTES_NOP6,
+ BYTES_NOP7,
+ BYTES_NOP8,
+#ifdef CONFIG_64BIT
+ BYTES_NOP9,
+ BYTES_NOP10,
+ BYTES_NOP11,
#endif
-
-#ifdef K8_NOP1
-static const unsigned char k8nops[] =
-{
- K8_NOP1,
- K8_NOP2,
- K8_NOP3,
- K8_NOP4,
- K8_NOP5,
- K8_NOP6,
- K8_NOP7,
- K8_NOP8,
- K8_NOP5_ATOMIC
};
-static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+
+const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops,
+ x86nops + 1,
+ x86nops + 1 + 2,
+ x86nops + 1 + 2 + 3,
+ x86nops + 1 + 2 + 3 + 4,
+ x86nops + 1 + 2 + 3 + 4 + 5,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
+#endif
};
+
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
#endif
-#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char k7nops[] =
-{
- K7_NOP1,
- K7_NOP2,
- K7_NOP3,
- K7_NOP4,
- K7_NOP5,
- K7_NOP6,
- K7_NOP7,
- K7_NOP8,
- K7_NOP5_ATOMIC
-};
-static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
-{
- NULL,
- k7nops,
- k7nops + 1,
- k7nops + 1 + 2,
- k7nops + 1 + 2 + 3,
- k7nops + 1 + 2 + 3 + 4,
- k7nops + 1 + 2 + 3 + 4 + 5,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
#endif
+static void *its_page;
+static unsigned int its_offset;
+struct its_array its_pages;
-#ifdef P6_NOP1
-static const unsigned char p6nops[] =
-{
- P6_NOP1,
- P6_NOP2,
- P6_NOP3,
- P6_NOP4,
- P6_NOP5,
- P6_NOP6,
- P6_NOP7,
- P6_NOP8,
- P6_NOP5_ATOMIC
-};
-static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+static void *__its_alloc(struct its_array *pages)
{
- NULL,
- p6nops,
- p6nops + 1,
- p6nops + 1 + 2,
- p6nops + 1 + 2 + 3,
- p6nops + 1 + 2 + 3 + 4,
- p6nops + 1 + 2 + 3 + 4 + 5,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
-#endif
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
-/* Initialize these to a safe default */
-#ifdef CONFIG_X86_64
-const unsigned char * const *ideal_nops = p6_nops;
-#else
-const unsigned char * const *ideal_nops = intel_nops;
-#endif
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
-void __init arch_init_ideal_nops(void)
+ return no_free_ptr(page);
+}
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
+ u8 *bytes = thunk;
+ int offset = 0;
+ int i = 0;
+
+#ifdef CONFIG_FINEIBT
+ if (cfi_paranoid) {
/*
- * Due to a decoder implementation quirk, some
- * specific Intel CPUs actually perform better with
- * the "k8_nops" than with the SDM-recommended NOPs.
+ * When ITS uses an indirect branch thunk, the fineibt_paranoid
+ * caller sequence doesn't fit in the caller site. So put the
+ * remaining part of the sequence (UDB + JNE) into the ITS
+ * thunk.
*/
- if (boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model >= 0x0f &&
- boot_cpu_data.x86_model != 0x1c &&
- boot_cpu_data.x86_model != 0x26 &&
- boot_cpu_data.x86_model != 0x27 &&
- boot_cpu_data.x86_model < 0x30) {
- ideal_nops = k8_nops;
- } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
- ideal_nops = p6_nops;
- } else {
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- ideal_nops = intel_nops;
+ bytes[i++] = 0xd6; /* UDB */
+ bytes[i++] = 0x75; /* JNE */
+ bytes[i++] = 0xfd;
+
+ offset = 1;
+ }
#endif
- }
- break;
- case X86_VENDOR_HYGON:
- ideal_nops = p6_nops;
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* JMP *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk + offset;
+}
+
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
return;
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 > 0xf) {
- ideal_nops = p6_nops;
- return;
- }
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
- /* fall through */
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
- default:
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- if (boot_cpu_has(X86_FEATURE_K8))
- ideal_nops = k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- ideal_nops = k7_nops;
- else
- ideal_nops = intel_nops;
-#endif
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
+ execmem_free(page);
}
+ kfree(mod->arch.its_pages.pages);
}
+#endif /* CONFIG_MODULES */
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
+static void *its_alloc(void)
{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, ideal_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
+ struct its_array *pages = &its_pages;
+ void *page;
+
+#ifdef CONFIG_MODULES
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
+#endif
+
+ page = __its_alloc(pages);
+ if (!page)
+ return NULL;
+
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
+
+ return page;
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * The ITS thunk contains an indirect jump and an int3 instruction so
+ * its size is 3 or 4 bytes depending on the register used. If CFI
+ * paranoid is used then 3 extra bytes are added in the ITS thunk to
+ * complete the fineibt_paranoid caller sequence.
+ */
+ if (cfi_paranoid)
+ size += 3;
+#endif
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
}
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ return its_init_thunk(thunk, reg);
+}
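/*
 * Annotation (not part of this patch): a quick sanity check of the alignment
 * arithmetic above, using hypothetical offsets. With its_offset = 68 and
 * size = 4, (68 + 4 - 1) % 64 = 7 falls in the lower half of a cacheline, so
 * ((68 - 1) | 0x3F) + 33 = 127 + 33 = 160 bumps the thunk to byte 32 of the
 * next cacheline; its last byte then sits at 163 % 64 = 35, i.e. in the upper
 * half, keeping the thunk's indirect branch out of the ITS-unsafe lower half.
 */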
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+ /* Paranoid thunk starts 2 bytes before */
+ if (cfi_paranoid)
+ return thunk - 2;
+#endif
+ return thunk;
}
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
/*
- * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
*/
-static inline bool is_jmp(const u8 opcode)
-{
- return opcode == 0xeb || opcode == 0xe9;
-}
-static void __init_or_module
-recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+/*
+ * Fill the buffer with a single effective instruction of size @len.
+ *
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *next_rip, *tgt_rip;
- s32 n_dspl, o_dspl;
- int repl_len;
+ u8 *target = buf + len;
- if (a->replacementlen != 5)
+ if (!len)
return;
- o_dspl = *(s32 *)(insnbuf + 1);
+ if (len <= ASM_NOP_MAX) {
+ memcpy(buf, x86_nops[len], len);
+ return;
+ }
- /* next_rip of the replacement JMP */
- next_rip = repl_insn + a->replacementlen;
- /* target rip of the replacement JMP */
- tgt_rip = next_rip + o_dspl;
- n_dspl = tgt_rip - orig_insn;
+ if (len < 128) {
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
+ }
- DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+ for (; buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
+}
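/*
 * Annotation (not part of this patch): hypothetical examples of the three
 * cases above, assuming ASM_NOP_MAX is 11 as the 64-bit x86_nops[] table
 * implies. len == 6 becomes a single BYTES_NOP6; len == 40 becomes a 2-byte
 * JMP8 to buf+40 followed by 38 INT3 bytes; len == 200 becomes a 5-byte JMP32
 * to buf+200 followed by 195 INT3 bytes. Only the NOP or JMP at the front is
 * ever executed; the INT3 padding is jumped over.
 */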
- if (tgt_rip - orig_insn >= 0) {
- if (n_dspl - 2 <= 127)
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
- /* negative offset */
- } else {
- if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
- goto two_byte_jmp;
- else
- goto five_byte_jmp;
+/*
+ * Find the offset of the first non-NOP instruction starting at @offset
+ * but no further than @len.
+ */
+static int skip_nops(u8 *buf, int offset, int len)
+{
+ struct insn insn;
+
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &buf[offset]))
+ break;
+
+ if (!insn_is_nop(&insn))
+ break;
}
-two_byte_jmp:
- n_dspl -= 2;
+ return offset;
+}
- insnbuf[0] = 0xeb;
- insnbuf[1] = (s8)n_dspl;
- add_nops(insnbuf + 2, 3);
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
+{
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;
- repl_len = 2;
- goto done;
+ if (insn_decode_kernel(&insn, &buf[i]))
+ return;
-five_byte_jmp:
- n_dspl -= 5;
+ next = i + insn.length;
- insnbuf[0] = 0xe9;
- *(s32 *)&insnbuf[1] = n_dspl;
+ if (insn_is_nop(&insn)) {
+ int nop = i;
- repl_len = 5;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
-done:
+ next = skip_nops(buf, next, len);
- DPRINTK("final displ: 0x%08x, JMP 0x%lx",
- n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
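/*
 * Annotation (not part of this patch): concrete illustration of the pass
 * above. A patched site containing "90 90 90 90 90" followed by a real
 * instruction is rewritten in place to the single 5-byte BYTES_NOP5, so the
 * CPU fetches and decodes one NOP instead of five single-byte ones.
 */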
/*
- * "noinline" to cause control flow change and thus invalidate I$ and
- * cause refetch after modification.
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
+ *
+ * The source offset is:
+ *
+ * src_imm = target - src_next_ip (1)
+ *
+ * and the target offset is:
+ *
+ * dst_imm = target - dst_next_ip (2)
+ *
+ * so rework (1) as an expression for target like:
+ *
+ * target = src_imm + src_next_ip (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
+ *
+ * src_next_ip = src + ip_offset
+ * dst_next_ip = dst + ip_offset (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ * = src_imm + src - dst + ip_offset - ip_offset
+ * = src_imm + src - dst (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
*/
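/*
 * Annotation (not part of this patch): worked example of (5) with made-up
 * addresses. A 5-byte CALL at src = 0x1000 targeting 0x5000 has
 * src_imm = 0x5000 - 0x1005 = 0x3ffb. Copied to dst = 0x2000, (5) gives
 * dst_imm = 0x3ffb + 0x1000 - 0x2000 = 0x2ffb, which indeed equals
 * 0x5000 - 0x2005, the displacement needed from the new location.
 */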
-static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+
+#define apply_reloc_n(n_, p_, d_) \
+ do { \
+ s32 v = *(s##n_ *)(p_); \
+ v += (d_); \
+ BUG_ON((v >> 31) != (v >> (n_-1))); \
+ *(s##n_ *)(p_) = (s##n_)v; \
+ } while (0)
+
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
{
- unsigned long flags;
- int i;
+ switch (n) {
+ case 1: apply_reloc_n(8, ptr, diff); break;
+ case 2: apply_reloc_n(16, ptr, diff); break;
+ case 4: apply_reloc_n(32, ptr, diff); break;
+ default: BUG();
+ }
+}
+
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+ u8 *target = src + offset;
+ /*
+ * If the target is inside the patched block, it's relative to the
+ * block itself and does not need relocation.
+ */
+ return (target < src || target > src + src_len);
+}
+
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ for (int next, i = 0; i < instrlen; i = next) {
+ struct insn insn;
- for (i = 0; i < a->padlen; i++) {
- if (instr[i] != 0x90)
+ if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
return;
+
+ next = i + insn.length;
+
+ switch (insn.opcode.bytes[0]) {
+ case 0x0f:
+ if (insn.opcode.bytes[1] < 0x80 ||
+ insn.opcode.bytes[1] > 0x8f)
+ break;
+
+ fallthrough; /* Jcc.d32 */
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case CALL_INSN_OPCODE:
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
+ apply_reloc(insn.immediate.nbytes,
+ buf + i + insn_offset_immediate(&insn),
+ repl - instr);
+ }
+
+ /*
+ * Where possible, convert JMP.d32 into JMP.d8.
+ */
+ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ s32 imm = insn.immediate.value;
+ imm += repl - instr;
+ imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+ if ((imm >> 31) == (imm >> 7)) {
+ buf[i+0] = JMP8_INSN_OPCODE;
+ buf[i+1] = (s8)imm;
+
+ memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+ }
+ }
+ break;
+ }
+
+ if (insn_rip_relative(&insn)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
+ apply_reloc(insn.displacement.nbytes,
+ buf + i + insn_offset_displacement(&insn),
+ repl - instr);
+ }
+ }
}
+}
- local_irq_save(flags);
- add_nops(instr + (a->instrlen - a->padlen), a->padlen);
- local_irq_restore(flags);
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
+
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
- DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
- instr, a->instrlen - a->padlen, a->padlen);
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
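/*
 * Annotation (not part of this patch): sketch of the transformation above for
 * a hypothetical pv_ops slot. The original 6-byte
 * "ff 15 <disp32> call *pv_ops.some_op(%rip)" site is replaced by the 5-byte
 * "call BUG_func" template, whose immediate is then re-targeted at whatever
 * function pointer the slot currently holds; a NULL slot falls back to
 * BUG_func, and a slot holding nop_func makes the whole site collapse into
 * NOP padding (return value 0).
 */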
+
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
}
/*
@@ -365,11 +599,22 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
- struct alt_instr *a;
+ u8 insn_buff[MAX_PATCH_LEN];
u8 *instr, *replacement;
- u8 insnbuf[MAX_PATCH_LEN];
+ struct alt_instr *a, *b;
+
+ DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bounds report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
- DPRINTK("alt table %px, -> %px", start, end);
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -380,56 +625,1438 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- int insnbuf_sz = 0;
+ unsigned int insn_buff_sz = 0;
+
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
- instr = (u8 *)&a->instr_offset + a->instr_offset;
+ instr = instr_va(a);
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid)) {
- if (a->padlen > 1)
- optimize_nops(a, instr);
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature not present but ALT_FLAG_NOT is set to mean,
+ * patch if feature is *NOT* present.
+ */
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
- DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
- instr, a->instrlen,
- replacement, a->replacementlen, a->padlen);
+ instr, instr, a->instrlen,
+ replacement, a->replacementlen, a->flags);
+
+ memcpy(insn_buff, replacement, a->replacementlen);
+ insn_buff_sz = a->replacementlen;
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
- memcpy(insnbuf, replacement, a->replacementlen);
- insnbuf_sz = a->replacementlen;
+ for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+ insn_buff[insn_buff_sz] = 0x90;
+
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+
+ text_poke_early(instr, insn_buff, insn_buff_sz);
+ }
+ kasan_enable_current();
+}
+
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
+
+/*
+ * [CS]{,3} CALL/JMP *%\reg [INT3]*
+ */
+static int emit_indirect(int op, int reg, u8 *bytes, int len)
+{
+ int cs = 0, bp = 0;
+ int i = 0;
+ u8 modrm;
+
+ /*
+ * Set @len to the excess bytes after writing the instruction.
+ */
+ len -= 2 + (reg >= 8);
+ WARN_ON_ONCE(len < 0);
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
/*
- * 0xe8 is a relative jump; fix the offset.
- *
- * Instruction length is checked before the opcode to avoid
- * accessing uninitialized bytes for zero-length replacements.
+ * Additional NOP is better than prefix decode penalty.
*/
- if (a->replacementlen == 5 && *insnbuf == 0xe8) {
- *(s32 *)(insnbuf + 1) += replacement - instr;
- DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
- *(s32 *)(insnbuf + 1),
- (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ if (len <= 3)
+ cs = len;
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ bp = len;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ while (cs--)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ while (bp--)
+ bytes[i++] = 0xcc; /* INT3 */
+
+ return i;
+}
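/*
 * Annotation (not part of this patch): two hypothetical encodings produced by
 * the routine above. A 6-byte "cs call __x86_indirect_thunk_r11" site
 * (excess = 3) becomes "2e 2e 2e 41 ff d3", i.e. a CS-padded "call *%r11";
 * a 5-byte "jmp __x86_indirect_thunk_rax" site becomes "ff e0 cc cc cc",
 * i.e. "jmp *%rax" followed by INT3 padding, since nothing falls through a JMP.
 */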
+
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ call_dest,
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ jmp_dest,
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at an ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ return false;
+}
+#endif
+
+#endif /* CONFIG_MITIGATION_ITS */
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,lfence when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return emit_call_track_retpoline(addr, insn, reg, bytes);
+
+ return -1;
+ }
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ if (is_jcc32(insn)) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
+ ret = emit_indirect(op, reg, bytes + i, insn->length - i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
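/*
 * Annotation (not part of this patch): concrete outcomes of the logic above
 * for a hypothetical 5-byte "call __x86_indirect_thunk_rax" site. With
 * retpolines disabled it becomes "2e 2e 2e ff d0" (CS-padded "call *%rax");
 * with spectre_v2=retpoline,lfence it becomes "0f ae e8 ff d0"
 * (LFENCE; call *%rax). Both fill the original 5 bytes exactly.
 */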
+
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+ u8 *dest;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ /* Check for cfi_paranoid + ITS */
+ dest = addr + insn.length + insn.immediate.value;
+ if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) {
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+ }
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(addr, bytes, len);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
}
+ }
+}
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
+/*
+ * Rewrite the compiler generated return thunk tail-calls.
+ *
+ * For example, convert:
+ *
+ * JMP __x86_return_thunk
+ *
+ * into:
+ *
+ * RET
+ */
+static int patch_return(void *addr, struct insn *insn, u8 *bytes)
+{
+ int i = 0;
+
+ /* Patch the custom return thunks... */
+ if (cpu_wants_rethunk_at(addr)) {
+ i = JMP32_INSN_SIZE;
+ __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
+ } else {
+ /* ... or patch them out if not needed. */
+ bytes[i++] = RET_INSN_OPCODE;
+ }
+
+ for (; i < insn->length;)
+ bytes[i++] = INT3_INSN_OPCODE;
+ return i;
+}
+
+void __init_or_module noinline apply_returns(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ if (cpu_wants_rethunk())
+ static_call_force_reinit();
+
+ for (s = start; s < end; s++) {
+ void *dest = NULL, *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op == JMP32_INSN_OPCODE)
+ dest = addr + insn.length + insn.immediate.value;
+
+ if (__static_call_fixup(addr, op, dest) ||
+ WARN_ONCE(dest != &__x86_return_thunk,
+ "missing return thunk: %pS-%pS: %*ph",
+ addr, dest, 5, addr))
+ continue;
+
+ DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_return(addr, &insn, bytes);
+ if (len == insn.length) {
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+#else /* !CONFIG_MITIGATION_RETHUNK: */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+#endif /* !CONFIG_MITIGATION_RETHUNK */
+
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
+
+Efault:
+ return false;
+}
+
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
+
+Efault:
+ return false;
+}
+
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
+{
+ u32 poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(!is_endbr(addr)))
+ return;
+
+ DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+}
+
+/*
+ * Generated by: objtool --ibt
+ *
+ * Seal the functions for indirect calls by clobbering the ENDBR instructions
+ * and the kCFI hash value.
+ */
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ poison_endbr(addr);
+ if (IS_ENABLED(CONFIG_FINEIBT))
+ poison_cfi(addr - 16);
+ }
+}
+
+#else /* !CONFIG_X86_KERNEL_IBT: */
+
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_X86_KERNEL_IBT */
+
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+# define __CFI_DEFAULT CFI_AUTO
+#elif defined(CONFIG_CFI)
+# define __CFI_DEFAULT CFI_KCFI
+#else
+# define __CFI_DEFAULT CFI_OFF
+#endif
+
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+static bool cfi_debug __ro_after_init;
- if (a->replacementlen && is_jmp(replacement[0]))
- recompute_jump(a, instr, replacement, insnbuf);
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
+#ifdef CONFIG_CFI
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
- if (a->instrlen > a->replacementlen) {
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- insnbuf_sz += a->instrlen - a->replacementlen;
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
+
+static bool cfi_rand __ro_after_init = true;
+static u32 cfi_seed __ro_after_init;
+
+/*
+ * Re-hash the CFI hash with a boot-time seed while making sure the result is
+ * not a valid ENDBR instruction.
+ */
+static u32 cfi_rehash(u32 hash)
+{
+ hash ^= cfi_seed;
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
+ bool lsb = hash & 1;
+ hash >>= 1;
+ if (lsb)
+ hash ^= 0x80200003;
+ }
+ return hash;
+}
+
+static __init int cfi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
}
- DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
- text_poke_early(instr, insnbuf, insnbuf_sz);
+ if (!strcmp(str, "auto")) {
+ cfi_mode = CFI_AUTO;
+ } else if (!strcmp(str, "off")) {
+ cfi_mode = CFI_OFF;
+ cfi_rand = false;
+ } else if (!strcmp(str, "debug")) {
+ cfi_debug = true;
+ } else if (!strcmp(str, "kcfi")) {
+ cfi_mode = CFI_KCFI;
+ } else if (!strcmp(str, "fineibt")) {
+ cfi_mode = CFI_FINEIBT;
+ } else if (!strcmp(str, "norand")) {
+ cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI: mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("CFI: ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("CFI: ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("CFI: ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
+ } else {
+ pr_err("CFI: Ignoring unknown option (%s).", str);
+ }
+
+ str = next;
}
+
+ return 0;
+}
+early_param("cfi", cfi_parse_cmdline);
+
+/*
+ * kCFI FineIBT
+ *
+ * __cfi_\func: __cfi_\func:
+ * movl $0x12345678,%eax // 5 endbr64 // 4
+ * nop subl $0x12345678,%eax // 5
+ * nop jne.d32,pn \func+3 // 7
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * \func: \func:
+ * endbr64 nopl -42(%rax)
+ *
+ *
+ * caller: caller:
+ * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%eax // 5
+ * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
+ * je 1f // 2 nop5 // 5
+ * ud2 // 2
+ * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
+ *
+ *
+ * Notably, the FineIBT sequences are crafted such that branches are presumed
+ * non-taken. This is based on Agner Fog's optimization manual, which states:
+ *
+ * "Make conditional jumps most often not taken: The efficiency and throughput
+ * for not-taken branches is better than for taken branches on most
+ * processors. Therefore, it is good to place the most frequent branch first"
+ */
+
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13>
+ * 10: 0f 1f 40 d6 nopl -0x2a(%rax)
+ *
+ * Note that the JNE target is the 0xD6 byte inside the NOPL; this decodes as
+ * UDB on x86_64 and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %eax \n"
+ "fineibt_preamble_bhi: \n"
+ " cs jne.d32 fineibt_preamble_start+0x13 \n"
+ "#fineibt_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_preamble_end: \n"
+ ".popsection\n"
+);
+
+extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
+extern u8 fineibt_preamble_end[];
+
+#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 0x13
+#define fineibt_preamble_hash 5
+
+/*
+ * <fineibt_caller_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_caller_start: \n"
+ " movl $0x12345678, %eax \n"
+ " lea -0x10(%r11), %r11 \n"
+ ASM_NOP5
+ "fineibt_caller_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
+#define fineibt_caller_hash 1
+
+#define fineibt_caller_jmp (fineibt_caller_size - 2)
+
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0xb(%r11), %eax
+ * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably LEA does not modify flags and can be reordered with the CMP,
+ * avoiding a dependency. Again, using a non-taken (backwards) branch
+ * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
+ * Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " mov $0x12345678, %eax \n"
+ " cmpl -11(%r11), %eax \n"
+ " cs lea -0x10(%r11), %r11 \n"
+ "#fineibt_caller_size: \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " cs call *%r11 \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
+{
+ u8 *p = addr;
+
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
+ return *(u32 *)(addr + 1);
+ }
+
+ return 0; /* invalid hash value */
+}
+
+static u32 decode_caller_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
+ if (p[0] == 0x41 && p[1] == 0xba)
+ return -*(u32 *)(addr + 2);
+
+ /* eb 0c 88 a9 cb ed jmp.d8 +12 */
+ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
+ return -*(u32 *)(addr + 2);
+
+ return 0; /* invalid hash value */
+}
+
+/* .retpoline_sites */
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
+ * intact for later usage. Also see decode_caller_hash() and
+ * cfi_rewrite_callers().
+ */
+ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, jmp, 2);
+ }
+
+ return 0;
+}
+
+static int cfi_enable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Re-enable kCFI, undo what cfi_disable_callers() did.
+ */
+ const u8 mov[] = { 0x41, 0xba };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, mov, 2);
+ }
+
+ return 0;
+}
+
+/* .cfi_sites */
+static int cfi_rand_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr, NULL);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ hash = cfi_rehash(hash);
+ text_poke_early(addr + 1, &hash, 4);
+ }
+
+ return 0;
+}
+
+/*
+ * Inline the bhi-arity 1 case:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 48 0f 45 f8 cmovne %rax, %rdi
+ * d: 2e 75 03 jne,pn foo+0x3
+ *
+ * foo:
+ * 10: 0f 1f 40 <d6> nopl -42(%rax)
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_bhi1_start: \n"
+ " cmovne %rax, %rdi \n"
+ " cs jne fineibt_bhi1_func + 0x3 \n"
+ "fineibt_bhi1_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_bhi1_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_bhi1_start[];
+extern u8 fineibt_bhi1_end[];
+
+#define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start)
+
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ u8 bytes[MAX_INSN_SIZE];
+
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ text_poke_early(addr + fineibt_preamble_bhi,
+ fineibt_bhi1_start, fineibt_bhi1_size);
+ return;
+ }
+
+ /*
+ * Replace the bytes at fineibt_preamble_bhi with a CALL instruction
+ * that lines up exactly with the end of the preamble, such that the
+ * return address will be foo+0.
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity]
+ */
+ bytes[0] = 0x2e;
+ bytes[1] = 0x2e;
+ __text_gen_insn(bytes + 2, CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi + 2,
+ __bhi_args[arity], CALL_INSN_SIZE);
+
+ text_poke_early(addr + fineibt_preamble_bhi, bytes, 7);
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ int arity;
+ u32 hash;
+
+ /*
+ * When the function doesn't start with ENDBR the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need no CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
+ }
+
+ return 0;
+}
+
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
+ }
+}
+
+/* .retpoline_sites */
+static int cfi_rand_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ hash = -cfi_rehash(hash);
+ text_poke_early(addr + 2, &hash, 4);
+ }
+ }
+
+ return 0;
+}
+
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+ u8 *tmp = its_allocate_thunk(reg);
+ if (tmp)
+ thunk = tmp;
+#endif
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ u8 bytes[20];
+ u32 hash;
+ int ret;
+ u8 op;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+ emit_paranoid_trampoline(addr + fineibt_caller_size,
+ &insn, 11, bytes + fineibt_caller_size);
+ } else {
+ int len = fineibt_paranoid_size - fineibt_paranoid_ind;
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len);
+ if (WARN_ON_ONCE(ret != len))
+ continue;
+ }
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
+ }
+
+ return 0;
+}
+
+#define pr_cfi_debug(X...) if (cfi_debug) pr_info(X)
+
+#define FINEIBT_WARN(_f, _v) \
+ WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v)
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ int ret;
+
+ if (FINEIBT_WARN(fineibt_preamble_size, 20) ||
+ FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) ||
+ FINEIBT_WARN(fineibt_caller_size, 14) ||
+ FINEIBT_WARN(fineibt_paranoid_size, 20))
+ return;
+
+ if (cfi_mode == CFI_AUTO) {
+ cfi_mode = CFI_KCFI;
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
+ cfi_mode = CFI_FINEIBT;
+ }
+ }
+
+ /*
+ * Rewrite the callers to not use the __cfi_ stubs, such that we might
+ * rewrite them. This disables all CFI. If this succeeds but any of the
+ * later stages fails, we're without CFI.
+ */
+ pr_cfi_debug("CFI: disabling all indirect call checking\n");
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (cfi_rand) {
+ if (builtin) {
+ cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
+ pr_cfi_debug("CFI: cfi_seed: 0x%08x\n", cfi_seed);
+
+ pr_cfi_debug("CFI: rehashing all preambles\n");
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ pr_cfi_debug("CFI: rehashing all indirect calls\n");
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+ } else {
+ pr_cfi_debug("CFI: rehashing disabled\n");
+ }
+
+ switch (cfi_mode) {
+ case CFI_OFF:
+ if (builtin)
+ pr_info("CFI: disabled\n");
+ return;
+
+ case CFI_KCFI:
+ pr_cfi_debug("CFI: re-enabling all indirect call checking\n");
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("CFI: Using %sretpoline kCFI\n",
+ cfi_rand ? "rehashed " : "");
+ return;
+
+ case CFI_FINEIBT:
+ pr_cfi_debug("CFI: adding FineIBT to all preambles\n");
+ /* place the FineIBT preamble at func()-16 */
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ /* rewrite the callers to target func()-16 */
+ pr_cfi_debug("CFI: rewriting indirect call sites to use FineIBT\n");
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ /* now that nobody targets func()+0, remove ENDBR there */
+ pr_cfi_debug("CFI: removing old endbr insns\n");
+ cfi_rewrite_endbr(start_cfi, end_cfi);
+
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
+ return;
+
+ default:
+ break;
+ }
+
+err:
+ pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
+}
+
+static inline void poison_hash(void *addr)
+{
+ *(u32 *)addr = 0;
+}
+
+static void poison_cfi(void *addr)
+{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes;
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ /*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
+ * __cfi_\func:
+ * nopl -42(%rax)
+ * sub $0, %eax
+ * jne \func+3
+ * \func:
+ * nopl -42(%rax)
+ */
+ poison_endbr(addr);
+ poison_hash(addr + fineibt_preamble_hash);
+ break;
+
+ case CFI_KCFI:
+ /*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
+ * __cfi_\func:
+ * movl $0, %eax
+ * .skip 11, 0x90
+ */
+ poison_hash(addr + 1);
+ break;
+
+ default:
+ break;
+ }
+}
+
+#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
+
+/*
+ * When regs->ip points to a 0xD6 byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We validate the preamble by checking for the ENDBR instruction relative to the
+ * UDB instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_prefix_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
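/*
 * Annotation (not part of this patch): why "*type = (u32)regs->ax + hash"
 * recovers the caller's hash. The caller loaded %eax with its expected hash,
 * and the preamble's SUB left %eax = caller_hash - callee_hash at the trap
 * point; adding back the callee hash stored at fineibt_preamble_hash yields
 * the original caller_hash, which the #UD handler reports as the type.
 */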
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack; it points to the
+ * FineIBT preamble. Since the CALL instruction is in the last 5
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_prefix_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+static bool is_paranoid_thunk(unsigned long addr)
+{
+ u32 thunk;
+
+ __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+ return (thunk & 0x00FFFFFF) == 0xfd75d6;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+
+ if (!cfi_paranoid)
+ return false;
+
+ if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+ }
+
+ /*
+ * The cfi_paranoid + ITS thunk combination results in:
+ *
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -11(%r11), %eax
+ * 9: 2e 4d 8d 5b f0 cs lea -0x10(%r11), %r11
+ * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
+ *
+ * Where the paranoid_thunk looks like:
+ *
+ * 1d: <d6> udb
+ * __x86_indirect_paranoid_thunk_r11:
+ * 1e: 75 fd jne 1d
+ * __x86_indirect_its_thunk_r11:
+ * 20: 41 ff e3 jmp *%r11
+ * 23: cc int3
+ *
+ */
+ if (is_paranoid_thunk(regs->ip)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ regs->ip = *target;
+ return true;
+ }
+
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ if (IS_ENABLED(CONFIG_CFI) && builtin)
+ pr_info("CFI: Using standard kCFI\n");
+}
+
+#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr) { }
+#endif
+
+#endif /* !CONFIG_FINEIBT */
+
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+ return __apply_fineibt(start_retpoline, end_retpoline,
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
}
#ifdef CONFIG_SMP
@@ -509,7 +2136,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
@@ -585,53 +2212,184 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+
+extern void int3_selftest_asm(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR "\n"
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
+ ANNOTATE_NOENDBR "\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
+ ASM_RET
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_ip(void); /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- struct paravirt_patch_site *p;
- char insnbuf[MAX_PATCH_LEN];
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
- for (p = start; p < end; p++) {
- unsigned int used;
+ OPTIMIZER_HIDE_VAR(selftest);
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insnbuf, p->instr, p->len);
- used = pv_ops.init.patch(p->instrtype, insnbuf,
- (unsigned long)p->instr, p->len);
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
- BUG_ON(used > p->len);
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
- /* Pad the rest with nops */
- add_nops(insnbuf + used, p->len - used);
- text_poke_early(p->instr, insnbuf, p->len);
- }
+ if (regs->ip - INT3_INSN_SIZE != selftest)
+ return NOTIFY_DONE;
+
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
+ return NOTIFY_STOP;
+}
+
+/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
+static noinline void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
+ */
+ int3_selftest_asm(&val);
+
+ BUG_ON(val != 0x1234);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
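As a rough user-space model of what the selftest relies on (not the kernel's implementation; struct and function names below are invented), emulating a CALL from an INT3 trap amounts to pushing the return address onto the interrupted stack and redirecting the saved instruction pointer; the stack gap described above is what makes that virtual PUSH safe.

#include <stdint.h>
#include <stdio.h>

#define INT3_SIZE 1
#define CALL_SIZE 5

/* Toy register file: just the fields the emulation touches. */
struct toy_regs {
	uint64_t ip;
	uint64_t sp;
};

static void toy_emulate_push(struct toy_regs *regs, uint64_t *stack, uint64_t val)
{
	regs->sp -= sizeof(uint64_t);
	stack[regs->sp / sizeof(uint64_t)] = val;
}

static void toy_emulate_call(struct toy_regs *regs, uint64_t *stack, uint64_t func)
{
	/* Return to the instruction after the (INT3-padded) CALL slot. */
	toy_emulate_push(regs, stack, regs->ip - INT3_SIZE + CALL_SIZE);
	regs->ip = func;
}

int main(void)
{
	uint64_t stack[8];
	/* Pretend the INT3 at address 0x1000 just trapped: ip is past it. */
	struct toy_regs regs = { .ip = 0x1000 + INT3_SIZE, .sp = sizeof(stack) };

	toy_emulate_call(&regs, stack, 0x2000);
	printf("ip=%#llx ret=%#llx\n",
	       (unsigned long long)regs.ip,
	       (unsigned long long)stack[regs.sp / sizeof(uint64_t)]);
	return 0;
}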
+
+static __initdata int __alt_reloc_selftest_addr;
+
+extern void __init __alt_reloc_selftest(void *arg);
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+ WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+ /*
+ * Tests text_poke_apply_relocation().
+ *
+ * This has a relative immediate (CALL) in a place other than the first
+ * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ *
+ * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
+ * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
+ *
+ * Getting this wrong will either crash and burn or tickle the WARN
+ * above.
+ */
+ asm_inline volatile (
+ ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ : ASM_CALL_CONSTRAINT
+ : [mem] "m" (__alt_reloc_selftest_addr)
+ : _ASM_ARG1
+ );
}
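The selftest only checks that relocation handling works; the underlying arithmetic is simple enough to show in isolation. A hedged sketch, not the kernel's text_poke_apply_relocation(): when an instruction with a rel32 operand is copied from src to dst, the displacement has to be adjusted by (src - dst) so it keeps pointing at the same absolute target.

#include <stdint.h>
#include <stdio.h>

/*
 * Adjust a rel32 displacement for code that was assembled at 'src' but
 * will run at 'dst'. The target src + insn_len + disp stays invariant.
 */
static int32_t retarget_rel32(int32_t disp, uint64_t src, uint64_t dst)
{
	return disp + (int32_t)((int64_t)src - (int64_t)dst);
}

int main(void)
{
	uint64_t src = 0x1000, dst = 0x9000;
	int32_t disp = 0x80;			/* original CALL rel32 */
	uint64_t target = src + 5 + disp;	/* 5-byte CALL */
	int32_t fixed = retarget_rel32(disp, src, dst);

	/* Both forms resolve to the same absolute address. */
	printf("%#llx %#llx\n",
	       (unsigned long long)target,
	       (unsigned long long)(dst + 5 + fixed));
	return 0;
}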
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ u64 ibt;
+
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
- * Ignoring it is worse than a unlikely patching race.
+ * Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check, the others follow quickly, so we don't
* expect a machine check to cause undue problems during code
* patching.
*/
+ /*
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
+ */
+ paravirt_set_cap();
+
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
+
+ __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+ __cfi_sites, __cfi_sites_end, true);
+ cfi_debug = false;
+
+ /*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+ apply_returns(__return_sites, __return_sites_end);
+
+ its_fini_core();
+
+ /*
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
+ */
+ callthunks_patch_builtin_calls();
+
apply_alternatives(__alt_instructions, __alt_instructions_end);
+ /*
+ * Seal all functions that do not have their address taken.
+ */
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+
+ ibt_restore(ibt);
+
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
@@ -641,16 +2399,17 @@ void __init alternative_instructions(void)
_text, _etext);
}
- if (!uniproc_patched || num_possible_cpus() == 1)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
-
restart_nmi();
alternatives_patched = 1;
+
+ alt_reloc_selftest();
}
/**
@@ -662,19 +2421,169 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to be protected against NMI or
+ * MCE handlers seeing an inconsistent instruction while you patch.
*/
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
- size_t len)
+void __init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
{
unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_NX) &&
+ is_module_text_address((unsigned long)addr)) {
+ /*
+ * Module text is initially marked as non-executable, so the
+ * code cannot be running and speculative code-fetches are
+ * prevented. Just change the code.
+ */
+ memcpy(addr, opcode, len);
+ } else {
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ sync_core();
+ local_irq_restore(flags);
+
+ /*
+ * Could also do a CLFLUSH here to speed up CPU recovery; but
+ * that causes hangs on some VIA CPUs.
+ */
+ }
+}
+
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
+
+/*
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
+ *
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
+ */
+
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ lass_stac();
+ __inline_memcpy(dst, src, len);
+ lass_clac();
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(const int *)src;
+
+ lass_stac();
+ __inline_memset(dst, c, len);
+ lass_clac();
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
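A brief note on the design: both helpers above are funneled through the single text_poke_f signature so that __text_poke() can stay agnostic about whether it is copying or filling; the memset variant therefore receives its fill byte through the same const void *src slot. A small user-space sketch of the same shape (names invented for the example):

#include <stdio.h>
#include <string.h>

typedef void fill_fn(void *dst, const void *src, size_t len);

static void fill_copy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

static void fill_set(void *dst, const void *src, size_t len)
{
	/* The byte value travels through 'src', as in text_poke_memset(). */
	memset(dst, *(const int *)src, len);
}

static void patch(fill_fn *fn, void *dst, const void *src, size_t len)
{
	fn(dst, src, len);	/* one call site, two behaviours */
}

int main(void)
{
	char buf[8] = "AAAAAAA";
	int c = 0xcc;

	patch(fill_copy, buf, "BB", 2);
	patch(fill_set, buf + 2, &c, 4);
	for (int i = 0; i < 8; i++)
		printf("%02x ", (unsigned char)buf[i]);
	printf("\n");
	return 0;
}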
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
+{
+ bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+ struct page *pages[2] = {NULL};
+ struct mm_struct *prev_mm;
+ unsigned long flags;
+ pte_t pte, *ptep;
+ spinlock_t *ptl;
+ pgprot_t pgprot;
+
+ /*
+ * While boot memory allocator is running we cannot use struct pages as
+ * they are not yet initialized. There is no way to recover.
+ */
+ BUG_ON(!after_bootmem);
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ if (cross_page_boundary)
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ if (cross_page_boundary)
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ /*
+ * If something went wrong, crash and burn since recovery paths are not
+ * implemented.
+ */
+ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
+ /*
+ * Map the page without the global bit, as TLB flushing is done with
+ * flush_tlb_mm_range(), which is intended for non-global PTEs.
+ */
+ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+ /*
+ * The lock is not really needed, but it lets us avoid open-coding this.
+ */
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
+
+ /*
+ * This must not fail; preallocated in poking_init().
+ */
+ VM_BUG_ON(!ptep);
+
local_irq_save(flags);
- memcpy(addr, opcode, len);
+
+ pte = mk_pte(pages[0], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
+
+ if (cross_page_boundary) {
+ pte = mk_pte(pages[1], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
+ }
+
+ /*
+ * Loading the temporary mm behaves as a compiler barrier, which
+ * guarantees that the PTE will be set at the time memcpy() is done.
+ */
+ prev_mm = use_temporary_mm(text_poke_mm);
+
+ kasan_disable_current();
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
+ kasan_enable_current();
+
+ /*
+ * Ensure that the PTE is only cleared after the instructions of memcpy
+ * were issued by using a compiler barrier.
+ */
+ barrier();
+
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
+ if (cross_page_boundary)
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
+
+ /*
+ * Loading the previous page-table hierarchy requires a serializing
+ * instruction that already allows the core to see the updated version.
+ * Xen-PV is assumed to serialize execution in a similar manner.
+ */
+ unuse_temporary_mm(prev_mm);
+
+ /*
+ * Flushing the TLB might involve IPIs, which would require enabled
+ * IRQs, but not if the mm is not used, as is the case at this point.
+ */
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
+ (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+ PAGE_SHIFT, false);
+
+ if (func == text_poke_memcpy) {
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, src, len));
+ }
+
local_irq_restore(flags);
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
+ pte_unmap_unlock(ptep, ptl);
return addr;
}
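To make the two-page mapping above a little more concrete, here is a hedged user-space sketch of just the boundary arithmetic: a write of len bytes starting at addr needs a second page exactly when the in-page offset plus len spills past PAGE_SIZE. Constants and names are local to the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

static unsigned long toy_offset_in_page(uintptr_t addr)
{
	return addr & (TOY_PAGE_SIZE - 1);
}

static bool crosses_page(uintptr_t addr, size_t len)
{
	return toy_offset_in_page(addr) + len > TOY_PAGE_SIZE;
}

int main(void)
{
	/* A 6-byte poke at offset 0xffd spills 3 bytes into the next page. */
	printf("%d\n", crosses_page(0x100000ffdUL, 6));	/* 1: map two pages   */
	printf("%d\n", crosses_page(0x100000ff0UL, 6));	/* 0: one page enough */
	return 0;
}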
@@ -688,47 +2597,107 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
+ *
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
- unsigned long flags;
- char *vaddr;
- struct page *pages[2];
- int i;
+ lockdep_assert_held(&text_mutex);
- /*
- * While boot memory allocator is runnig we cannot use struct
- * pages as they are not yet initialized.
- */
- BUG_ON(!after_bootmem);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
- lockdep_assert_held(&text_mutex);
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ * despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
- if (!core_kernel_text((unsigned long)addr)) {
- pages[0] = vmalloc_to_page(addr);
- pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
- } else {
- pages[0] = virt_to_page(addr);
- WARN_ON(!PageReserved(pages[0]));
- pages[1] = virt_to_page(addr + PAGE_SIZE);
+void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
+ bool core_ok)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
+ return NULL;
+
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
}
- BUG_ON(!pages[0]);
- local_irq_save(flags);
- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
- if (pages[1])
- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- clear_fixmap(FIX_TEXT_POKE0);
- if (pages[1])
- clear_fixmap(FIX_TEXT_POKE1);
- local_flush_tlb();
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
- for (i = 0; i < len; i++)
- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
- local_irq_restore(flags);
+ return addr;
+}
+
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ mutex_lock(&text_mutex);
+ addr = text_poke_copy_locked(addr, opcode, len, false);
+ mutex_unlock(&text_mutex);
+ return addr;
+}
+
+/**
+ * text_poke_set - memset into (an unused part of) RX memory
+ * @addr: address to modify
+ * @c: the byte to fill the area with
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * This is useful to overwrite unused regions of RX memory with illegal
+ * instructions.
+ */
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
return addr;
}
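Both text_poke_copy_locked() and text_poke_set() chop the range into chunks that __text_poke() can cover with at most two mapped pages. As a hedged illustration of that loop only (no locking, no actual poking; the address and length are invented):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

int main(void)
{
	uintptr_t start = 0x100000f80UL;	/* arbitrary example address */
	size_t len = 10000, patched = 0;

	while (patched < len) {
		uintptr_t ptr = start + patched;
		unsigned long off = ptr & (TOY_PAGE_SIZE - 1);
		/* Largest chunk that still fits in the two mapped pages. */
		size_t s = min_size(TOY_PAGE_SIZE * 2 - off, len - patched);

		printf("chunk at +%zu: %zu bytes\n", patched, s);
		patched += s;
	}
	return 0;
}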
@@ -737,99 +2706,458 @@ static void do_sync_core(void *info)
sync_core();
}
-static bool bp_patching_in_progress;
-static void *bp_int3_handler, *bp_int3_addr;
+void smp_text_poke_sync_each_cpu(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
+struct smp_text_poke_loc {
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
+ u8 opcode;
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
+ u8 old;
+};
+
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
+ int nr_entries;
+} text_poke_array;
+
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
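The len == 6 convention above is easiest to see with the opcode maps themselves: a Jcc.d32 is the two-byte opcode 0x0f 0x80+cc followed by a 32-bit displacement, while the short form is 0x70+cc with an 8-bit displacement, so the second opcode byte minus 0x10 gives the Jcc.d8 opcode used for emulation. A small sketch of that mapping (buffer contents invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* A 6-byte Jcc.d32 form of jne: 0f 85 followed by disp32. */
	const uint8_t jcc_d32[6] = { 0x0f, 0x85, 0x44, 0x33, 0x22, 0x11 };

	/* Stored representation: drop the 0x0f, keep the remaining 5 bytes. */
	const uint8_t *stored = &jcc_d32[1];

	/* Map the second opcode byte to the short Jcc.d8 opcode (0x7x). */
	uint8_t opcode = jcc_d32[1] - 0x10;
	uint8_t cc = opcode & 0xf;

	printf("emulate as opcode %#x (cc=%#x), disp32 follows byte %#x\n",
	       opcode, cc, stored[0]);
	return 0;
}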
+
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
+{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
+
+ return true;
+}
-int poke_int3_handler(struct pt_regs *regs)
+static __always_inline void put_text_poke_array(void)
{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ smp_mb__before_atomic();
+ raw_atomic_dec(refs);
+}
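A hedged, single-counter user-space model of the refcount handshake used here (the kernel keeps one counter per CPU and uses raw atomics): readers only enter when they can increment a non-zero count, and the updater arms the count with a release store and later waits for it to drain.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refs;	/* 0 means "no patching in progress" */

static bool try_get(void)
{
	int old = atomic_load(&refs);

	/* inc_not_zero: only take a reference while patching is armed. */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&refs, &old, old + 1))
			return true;
	}
	return false;
}

static void put(void)
{
	atomic_fetch_sub(&refs, 1);
}

int main(void)
{
	printf("%d\n", try_get());		/* 0: not armed yet */

	atomic_store_explicit(&refs, 1, memory_order_release);	/* arm */
	if (try_get())				/* an INT3 handler runs... */
		put();				/* ...and drops its reference */

	/* Tear-down: drop the initial reference and wait for it to drain. */
	if (atomic_fetch_sub(&refs, 1) != 1) {
		while (atomic_load_explicit(&refs, memory_order_acquire))
			;	/* spin until the last handler is done */
	}
	printf("%d\n", atomic_load(&refs));	/* 0 */
	return 0;
}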
+
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
+{
+ return _stext + tpl->rel_addr;
+}
+
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
+{
+ if (tpl_a < text_poke_addr(tpl_b))
+ return -1;
+ if (tpl_a > text_poke_addr(tpl_b))
+ return 1;
+ return 0;
+}
+
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
+{
+ struct smp_text_poke_loc *tpl;
+ int ret = 0;
+ void *ip;
+
+ if (user_mode(regs))
+ return 0;
+
/*
* Having observed our INT3 instruction, we now must observe
- * bp_patching_in_progress.
+ * text_poke_array with non-zero refcount:
*
- * in_progress = TRUE INT3
- * WMB RMB
- * write INT3 if (in_progress)
- *
- * Idem for bp_int3_handler.
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
*/
smp_rmb();
- if (likely(!bp_patching_in_progress))
+ if (!try_get_text_poke_array())
return 0;
- if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
- return 0;
+ /*
+ * Discount the INT3. See smp_text_poke_batch_finish().
+ */
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
+
+ /*
+ * Skip the binary search if there is a single member in the vector.
+ */
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
+ patch_cmp);
+ if (!tpl)
+ goto out_put;
+ } else {
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
+ goto out_put;
+ }
- /* set up the specified breakpoint handler */
- regs->ip = (unsigned long) bp_int3_handler;
+ ip += tpl->len;
- return 1;
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ goto out_put;
+
+ case RET_INSN_OPCODE:
+ int3_emulate_ret(regs);
+ break;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tpl->disp);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
+ break;
+ case 0x70 ... 0x7f: /* Jcc */
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
+ break;
+
+ default:
+ BUG();
+ }
+
+ ret = 1;
+
+out_put:
+ put_text_poke_array();
+ return ret;
}
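The displacement handling in the handler above boils down to: take the address of the patched instruction, skip its full length, then apply the stored displacement. A hedged sketch of just that next-IP arithmetic for the JMP/CALL cases (addresses and values invented):

#include <stdint.h>
#include <stdio.h>

#define INT3_SIZE 1

/* Where execution continues for an emulated JMP/CALL patched at 'addr'. */
static uint64_t emulated_target(uint64_t addr, uint8_t len, int32_t disp)
{
	return addr + len + disp;	/* ip = end of instruction + disp */
}

int main(void)
{
	uint64_t addr = 0x5000;			/* start of the patched insn */
	uint64_t trap_ip = addr + INT3_SIZE;	/* regs->ip after the INT3 */

	/* The handler first discounts the INT3: ip = regs->ip - INT3_SIZE. */
	uint64_t insn = trap_ip - INT3_SIZE;

	/* A 5-byte JMP32 with disp 0x100 lands at 0x5105. */
	printf("%#llx\n", (unsigned long long)emulated_target(insn, 5, 0x100));

	/* A 2-byte JMP8 with disp 0 (the NOP2 case) falls through to 0x5002. */
	printf("%#llx\n", (unsigned long long)emulated_target(insn, 2, 0));
	return 0;
}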
/**
- * text_poke_bp() -- update instructions on live kernel on SMP
- * @addr: address to patch
- * @opcode: opcode of new instruction
- * @len: length to copy
- * @handler: address to jump to when the temporary breakpoint is hit
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
+ *
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
*
- * Modify multi-byte instruction by using int3 breakpoint on SMP.
- * We completely avoid stop_machine() here, and achieve the
- * synchronization using int3 breakpoint.
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
*
* The way it is done:
- * - add a int3 trap to the address that will be patched
- * - sync cores
- * - update all but the first byte of the patched range
- * - sync cores
- * - replace the first byte (int3) by the first byte of
- * replacing opcode
- * - sync cores
+ * - For each entry in the vector:
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - replace the first byte (INT3) by the first byte of the
+ * replacing opcode
+ * - SMP sync all CPUs
*/
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void smp_text_poke_batch_finish(void)
{
- unsigned char int3 = 0xcc;
+ unsigned char int3 = INT3_INSN_OPCODE;
+ unsigned int i;
+ int do_sync;
- bp_int3_handler = handler;
- bp_int3_addr = (u8 *)addr + sizeof(int3);
- bp_patching_in_progress = true;
+ if (!text_poke_array.nr_entries)
+ return;
lockdep_assert_held(&text_mutex);
/*
- * Corresponding read barrier in int3 notifier for making sure the
- * in_progress and handler are correctly ordered wrt. patching.
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
+ */
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
+
+ /*
+ * Function tracing can enable thousands of places that need to be
+ * updated. This can take quite some time, and with full kernel debugging
+ * enabled, this could cause the softlockup watchdog to trigger.
+ * This function gets called every 256 entries added to be patched.
+ * Call cond_resched() here to make sure that other tasks can get scheduled
+ * while processing all the functions being patched.
+ */
+ cond_resched();
+
+ /*
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
- text_poke(addr, &int3, sizeof(int3));
+ /*
+ * First step: add an INT3 trap to the address that will be patched.
+ */
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
+ }
+
+ smp_text_poke_sync_each_cpu();
- on_each_cpu(do_sync_core, NULL, 1);
+ /*
+ * Second step: update all but the first byte of the patched range.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
+
+ if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ if (len == 6) {
+ _new[0] = 0x0f;
+ memcpy(_new + 1, new, 5);
+ new = _new;
+ }
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ do_sync++;
+ }
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
+ }
- if (len - sizeof(int3) > 0) {
- /* patch all but the first byte */
- text_poke((char *)addr + sizeof(int3),
- (const char *) opcode + sizeof(int3),
- len - sizeof(int3));
+ if (do_sync) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ smp_text_poke_sync_each_cpu();
}
- /* patch the first byte */
- text_poke(addr, opcode, sizeof(int3));
+ /*
+ * Third step: replace the first byte (INT3) by the first byte of the
+ * replacing opcode.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
+
+ if (text_poke_array.vec[i].len == 6)
+ byte = 0x0f;
+
+ if (byte == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
+ do_sync++;
+ }
+
+ if (do_sync)
+ smp_text_poke_sync_each_cpu();
- on_each_cpu(do_sync_core, NULL, 1);
/*
- * sync_core() implies an smp_mb() and orders this store against
- * the writing of the new instruction.
+ * Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
- bp_patching_in_progress = false;
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
- return addr;
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
+}
+
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct smp_text_poke_loc *tpl;
+ struct insn insn;
+ int ret, i = 0;
+
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
+ if (len == 6)
+ i = 1;
+ memcpy((void *)tpl->text, opcode+i, len-i);
+ if (!emulate)
+ emulate = opcode;
+
+ ret = insn_decode_kernel(&insn, emulate);
+ BUG_ON(ret < 0);
+
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
+
+ if (is_jcc32(&insn)) {
+ /*
+ * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+ */
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
+ }
+
+ switch (tpl->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ }
+
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ case RET_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ case 0x70 ... 0x7f: /* Jcc */
+ tpl->disp = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
+}
+
+/*
+ * We strictly rely on text_poke_array.vec being ordered; ensure this is
+ * so by flushing early if needed.
+ */
+static bool text_poke_addr_ordered(void *addr)
+{
+ WARN_ON_ONCE(!addr);
+
+ if (!text_poke_array.nr_entries)
+ return true;
+
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
+
+ return true;
+}
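A tiny sketch of the ordering rule above, assuming a plain array of addresses: a new entry may only be appended while the vector stays sorted (or while there is room); otherwise the pending batch has to be flushed first. Names are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define MAX_ENTRIES 4

static uintptr_t vec[MAX_ENTRIES];
static int nr_entries;

static void flush(void)
{
	printf("flush %d entries\n", nr_entries);
	nr_entries = 0;
}

static void batch_add(uintptr_t addr)
{
	/* Flush early if full or if the new address would break ordering. */
	if (nr_entries == MAX_ENTRIES ||
	    (nr_entries && vec[nr_entries - 1] > addr))
		flush();
	vec[nr_entries++] = addr;
}

int main(void)
{
	batch_add(0x1000);
	batch_add(0x1010);
	batch_add(0x0800);	/* out of order: forces a flush first */
	flush();
	return 0;
}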
+
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
}
+/**
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Update a single instruction without the caller having to manage a
+ * batch vector or allocate memory for one. The single instruction is
+ * queued internally and patched in immediately.
+ */
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
+}