summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/alternative.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/alternative.c')
-rw-r--r--arch/x86/kernel/alternative.c1797
1 files changed, 1262 insertions, 535 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a5ead6a6d233..28518371d8bf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,35 +1,18 @@
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt
-#include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/mmu_context.h>
#include <linux/perf_event.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/bsearch.h>
-#include <linux/sync_core.h>
+#include <linux/execmem.h>
+
#include <asm/text-patching.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/paravirt.h>
-#include <asm/asm-prototypes.h>
+#include <asm/insn-eval.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
+#include <asm/nmi.h>
int __read_mostly alternatives_patched;
@@ -44,7 +27,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
#define DA_ENDBR 0x08
#define DA_SMP 0x10
-static unsigned int __initdata_or_module debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
@@ -123,6 +106,213 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
#endif
};
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
+#endif
+
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
+#endif
+static void *its_page;
+static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int offset = 0;
+ int i = 0;
+
+#ifdef CONFIG_FINEIBT
+ if (cfi_paranoid) {
+ /*
+ * When ITS uses indirect branch thunk the fineibt_paranoid
+ * caller sequence doesn't fit in the caller site. So put the
+ * remaining part of the sequence (UDB + JNE) into the ITS
+ * thunk.
+ */
+ bytes[i++] = 0xd6; /* UDB */
+ bytes[i++] = 0x75; /* JNE */
+ bytes[i++] = 0xfd;
+
+ offset = 1;
+ }
+#endif
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* JMP *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk + offset;
+}
+
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
+ execmem_free(page);
+ }
+ kfree(mod->arch.its_pages.pages);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+ struct its_array *pages = &its_pages;
+ void *page;
+
+#ifdef CONFIG_MODULES
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
+#endif
+
+ page = __its_alloc(pages);
+ if (!page)
+ return NULL;
+
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
+
+ return page;
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * The ITS thunk contains an indirect jump and an int3 instruction so
+ * its size is 3 or 4 bytes depending on the register used. If CFI
+ * paranoid is used then 3 extra bytes are added in the ITS thunk to
+ * complete the fineibt_paranoid caller sequence.
+ */
+ if (cfi_paranoid)
+ size += 3;
+#endif
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ return its_init_thunk(thunk, reg);
+}
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+ /* Paranoid thunk starts 2 bytes before */
+ if (cfi_paranoid)
+ return thunk - 2;
+#endif
+ return thunk;
+}
+
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
+
+/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
/*
* Fill the buffer with a single effective instruction of size @len.
*
@@ -132,67 +322,40 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void __init_or_module add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *target = instr + len;
+ u8 *target = buf + len;
if (!len)
return;
if (len <= ASM_NOP_MAX) {
- memcpy(instr, x86_nops[len], len);
+ memcpy(buf, x86_nops[len], len);
return;
}
if (len < 128) {
- __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
- instr += JMP8_INSN_SIZE;
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
} else {
- __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
- instr += JMP32_INSN_SIZE;
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
}
- for (;instr < target; instr++)
- *instr = INT3_INSN_OPCODE;
-}
-
-extern s32 __retpoline_sites[], __retpoline_sites_end[];
-extern s32 __return_sites[], __return_sites_end[];
-extern s32 __cfi_sites[], __cfi_sites_end[];
-extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
-
-/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
- /* Anything NOP, but no REP NOP */
- if (insn->opcode.bytes[0] == 0x90 &&
- (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
- return true;
-
- /* NOPL */
- if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
- return true;
-
- /* TODO: more nops */
-
- return false;
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
/*
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
-static int skip_nops(u8 *instr, int offset, int len)
+static int skip_nops(u8 *buf, int offset, int len)
{
struct insn insn;
for (; offset < len; offset += insn.length) {
- if (insn_decode_kernel(&insn, &instr[offset]))
+ if (insn_decode_kernel(&insn, &buf[offset]))
break;
if (!insn_is_nop(&insn))
@@ -203,55 +366,31 @@ static int skip_nops(u8 *instr, int offset, int len)
}
/*
- * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
- * to the end of the NOP sequence into a single NOP.
- */
-static bool __init_or_module
-__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
-{
- int i = *next - insn->length;
-
- switch (insn->opcode.bytes[0]) {
- case JMP8_INSN_OPCODE:
- case JMP32_INSN_OPCODE:
- *prev = i;
- *target = *next + insn->immediate.value;
- return false;
- }
-
- if (insn_is_nop(insn)) {
- int nop = i;
-
- *next = skip_nops(instr, *next, len);
- if (*target && *next == *target)
- nop = *prev;
-
- add_nop(instr + nop, *next - nop);
- DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
- return true;
- }
-
- *target = 0;
- return false;
-}
-
-/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- int prev, target = 0;
-
for (int next, i = 0; i < len; i = next) {
struct insn insn;
- if (insn_decode_kernel(&insn, &instr[i]))
+ if (insn_decode_kernel(&insn, &buf[i]))
return;
next = i + insn.length;
- __optimize_nops(instr, len, &insn, &next, &prev, &target);
+ if (insn_is_nop(&insn)) {
+ int nop = i;
+
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
+
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
}
}
@@ -325,12 +464,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-static void __init_or_module noinline
-apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- int prev, target = 0;
-
- for (int next, i = 0; i < len; i = next) {
+ for (int next, i = 0; i < instrlen; i = next) {
struct insn insn;
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
@@ -338,9 +474,6 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
next = i + insn.length;
- if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
- continue;
-
switch (insn.opcode.bytes[0]) {
case 0x0f:
if (insn.opcode.bytes[1] < 0x80 ||
@@ -352,10 +485,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
case JMP8_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case CALL_INSN_OPCODE:
- if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
apply_reloc(insn.immediate.nbytes,
buf + i + insn_offset_immediate(&insn),
- src - dest);
+ repl - instr);
}
/*
@@ -363,7 +496,7 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
*/
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
s32 imm = insn.immediate.value;
- imm += src - dest;
+ imm += repl - instr;
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
if ((imm >> 31) == (imm >> 7)) {
buf[i+0] = JMP8_INSN_OPCODE;
@@ -376,15 +509,83 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
if (insn_rip_relative(&insn)) {
- if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
apply_reloc(insn.displacement.nbytes,
buf + i + insn_offset_displacement(&insn),
- src - dest);
+ repl - instr);
}
}
}
}
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
+
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -398,11 +599,22 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
- struct alt_instr *a;
- u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a, *b;
DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -413,9 +625,20 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- int insn_buff_sz = 0;
+ unsigned int insn_buff_sz = 0;
+
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
- instr = (u8 *)&a->instr_offset + a->instr_offset;
+ instr = instr_va(a);
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -427,24 +650,28 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops(instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
- DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen);
+ replacement, a->replacementlen, a->flags);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -452,6 +679,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
}
+
+ kasan_enable_current();
}
static inline bool is_jcc32(struct insn *insn)
@@ -460,23 +689,36 @@ static inline bool is_jcc32(struct insn *insn)
return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
}
-#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
- * CALL/JMP *%\reg
+ * [CS]{,3} CALL/JMP *%\reg [INT3]*
*/
-static int emit_indirect(int op, int reg, u8 *bytes)
+static int emit_indirect(int op, int reg, u8 *bytes, int len)
{
+ int cs = 0, bp = 0;
int i = 0;
u8 modrm;
+ /*
+ * Set @len to the excess bytes after writing the instruction.
+ */
+ len -= 2 + (reg >= 8);
+ WARN_ON_ONCE(len < 0);
+
switch (op) {
case CALL_INSN_OPCODE:
modrm = 0x10; /* Reg = 2; CALL r/m */
+ /*
+ * Additional NOP is better than prefix decode penalty.
+ */
+ if (len <= 3)
+ cs = len;
break;
case JMP32_INSN_OPCODE:
modrm = 0x20; /* Reg = 4; JMP r/m */
+ bp = len;
break;
default:
@@ -484,6 +726,9 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return -1;
}
+ while (cs--)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
if (reg >= 8) {
bytes[i++] = 0x41; /* REX.B prefix */
reg -= 8;
@@ -495,10 +740,14 @@ static int emit_indirect(int op, int reg, u8 *bytes)
bytes[i++] = 0xff; /* opcode */
bytes[i++] = modrm;
+ while (bp--)
+ bytes[i++] = 0xcc; /* INT3 */
+
return i;
}
-static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
{
u8 op = insn->opcode.bytes[0];
int i = 0;
@@ -519,7 +768,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
switch (op) {
case CALL_INSN_OPCODE:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_call_thunk_array[reg],
+ call_dest,
CALL_INSN_SIZE);
i += CALL_INSN_SIZE;
break;
@@ -527,7 +776,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
case JMP32_INSN_OPCODE:
clang_jcc:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_jump_thunk_array[reg],
+ jmp_dest,
JMP32_INSN_SIZE);
i += JMP32_INSN_SIZE;
break;
@@ -542,6 +791,48 @@ clang_jcc:
return i;
}
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ return false;
+}
+#endif
+
+#endif /* CONFIG_MITIGATION_ITS */
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -616,20 +907,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
bytes[i++] = 0xe8; /* LFENCE */
}
- ret = emit_indirect(op, reg, bytes + i);
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
+ ret = emit_indirect(op, reg, bytes + i, insn->length - i);
if (ret < 0)
return ret;
i += ret;
- /*
- * The compiler is supposed to EMIT an INT3 after every unconditional
- * JMP instruction due to AMD BTC. However, if the compiler is too old
- * or SLS isn't enabled, we still need an INT3 after indirect JMPs
- * even on Intel.
- */
- if (op == JMP32_INSN_OPCODE && i < insn->length)
- bytes[i++] = INT3_INSN_OPCODE;
-
for (; i < insn->length;)
bytes[i++] = BYTES_NOP1;
@@ -649,6 +940,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
int len, ret;
u8 bytes[16];
u8 op1, op2;
+ u8 *dest;
ret = insn_decode_kernel(&insn, addr);
if (WARN_ON_ONCE(ret < 0))
@@ -658,8 +950,19 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
op2 = insn.opcode.bytes[1];
switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
+ /* Check for cfi_paranoid + ITS */
+ dest = addr + insn.length + insn.immediate.value;
+ if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) {
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+ }
break;
case 0x0f: /* escape */
@@ -677,7 +980,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
- optimize_nops(bytes, len);
+ optimize_nops(addr, bytes, len);
DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
@@ -685,7 +988,22 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
}
}
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
+
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
/*
* Rewrite the compiler generated return thunk tail-calls.
@@ -703,7 +1021,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
int i = 0;
/* Patch the custom return thunks... */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (cpu_wants_rethunk_at(addr)) {
i = JMP32_INSN_SIZE;
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
} else {
@@ -720,13 +1038,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- /*
- * Do not patch out the default return thunks if those needed are the
- * ones generated by the compiler.
- */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
- (x86_return_thunk == __x86_return_thunk))
- return;
+ if (cpu_wants_rethunk())
+ static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
@@ -761,32 +1074,53 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
}
}
}
-#else
+#else /* !CONFIG_MITIGATION_RETHUNK: */
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETHUNK */
+#endif /* !CONFIG_MITIGATION_RETHUNK */
-#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr);
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
-static void __init_or_module poison_endbr(void *addr, bool warn)
+Efault:
+ return false;
+}
+
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
{
- u32 endbr, poison = gen_endbr_poison();
+ u32 endbr;
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
- return;
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
- if (!is_endbr(endbr)) {
- WARN_ON_ONCE(warn);
+Efault:
+ return false;
+}
+
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
+{
+ u32 poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(!is_endbr(addr)))
return;
- }
DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
@@ -811,28 +1145,74 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- poison_endbr(addr, true);
+ poison_endbr(addr);
if (IS_ENABLED(CONFIG_FINEIBT))
poison_cfi(addr - 16);
}
}
-#else
+#else /* !CONFIG_X86_KERNEL_IBT: */
void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
-#endif /* CONFIG_X86_KERNEL_IBT */
+#endif /* !CONFIG_X86_KERNEL_IBT */
-#ifdef CONFIG_FINEIBT
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+# define __CFI_DEFAULT CFI_AUTO
+#elif defined(CONFIG_CFI)
+# define __CFI_DEFAULT CFI_KCFI
+#else
+# define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+static bool cfi_debug __ro_after_init;
+
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
+#ifdef CONFIG_CFI
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
+
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -843,7 +1223,7 @@ static u32 cfi_seed __ro_after_init;
static u32 cfi_rehash(u32 hash)
{
hash ^= cfi_seed;
- while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
bool lsb = hash & 1;
hash >>= 1;
if (lsb)
@@ -865,18 +1245,39 @@ static __init int cfi_parse_cmdline(char *str)
}
if (!strcmp(str, "auto")) {
- cfi_mode = CFI_DEFAULT;
+ cfi_mode = CFI_AUTO;
} else if (!strcmp(str, "off")) {
cfi_mode = CFI_OFF;
cfi_rand = false;
+ } else if (!strcmp(str, "debug")) {
+ cfi_debug = true;
} else if (!strcmp(str, "kcfi")) {
cfi_mode = CFI_KCFI;
} else if (!strcmp(str, "fineibt")) {
cfi_mode = CFI_FINEIBT;
} else if (!strcmp(str, "norand")) {
cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI: mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("CFI: ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("CFI: ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("CFI: ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
} else {
- pr_err("Ignoring unknown cfi option (%s).", str);
+ pr_err("CFI: Ignoring unknown option (%s).", str);
}
str = next;
@@ -891,10 +1292,8 @@ early_param("cfi", cfi_parse_cmdline);
*
* __cfi_\func: __cfi_\func:
* movl $0x12345678,%eax // 5 endbr64 // 4
- * nop subl $0x12345678,%r10d // 7
- * nop jz 1f // 2
- * nop ud2 // 2
- * nop 1: nop // 1
+ * nop subl $0x12345678,%eax // 5
+ * nop jne.d32,pn \func+3 // 7
* nop
* nop
* nop
@@ -902,39 +1301,70 @@ early_param("cfi", cfi_parse_cmdline);
* nop
* nop
* nop
+ * nop
+ * nop
+ * \func: \func:
+ * endbr64 nopl -42(%rax)
*
*
* caller: caller:
- * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
- * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
- * je 1f // 2 nop4 // 4
+ * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%eax // 5
+ * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
+ * je 1f // 2 nop5 // 5
* ud2 // 2
- * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
+ *
+ *
+ * Notably, the FineIBT sequences are crafted such that branches are presumed
+ * non-taken. This is based on Agner Fog's optimization manual, which states:
*
+ * "Make conditional jumps most often not taken: The efficiency and throughput
+ * for not-taken branches is better than for taken branches on most
+ * processors. Therefore, it is good to place the most frequent branch first"
*/
-asm( ".pushsection .rodata \n"
- "fineibt_preamble_start: \n"
- " endbr64 \n"
- " subl $0x12345678, %r10d \n"
- " je fineibt_preamble_end \n"
- " ud2 \n"
- " nop \n"
- "fineibt_preamble_end: \n"
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13>
+ * 10: 0f 1f 40 d6 nopl -0x2a(%rax)
+ *
+ * Note that the JNE target is the 0xD6 byte inside the NOPL, this decodes as
+ * UDB on x86_64 and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %eax \n"
+ "fineibt_preamble_bhi: \n"
+ " cs jne.d32 fineibt_preamble_start+0x13 \n"
+ "#fineibt_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_preamble_end: \n"
".popsection\n"
);
extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
-#define fineibt_preamble_hash 7
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 0x13
+#define fineibt_preamble_hash 5
+/*
+ * <fineibt_caller_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
+ */
asm( ".pushsection .rodata \n"
"fineibt_caller_start: \n"
- " movl $0x12345678, %r10d \n"
- " sub $16, %r11 \n"
- ASM_NOP4
+ " movl $0x12345678, %eax \n"
+ " lea -0x10(%r11), %r11 \n"
+ ASM_NOP5
"fineibt_caller_end: \n"
".popsection \n"
);
@@ -943,17 +1373,66 @@ extern u8 fineibt_caller_start[];
extern u8 fineibt_caller_end[];
#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
-#define fineibt_caller_hash 2
+#define fineibt_caller_hash 1
#define fineibt_caller_jmp (fineibt_caller_size - 2)
-static u32 decode_preamble_hash(void *addr)
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0x11(%r11), %eax
+ * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably LEA does not modify flags and can be reordered with the CMP,
+ * avoiding a dependency. Again, using a non-taken (backwards) branch
+ * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
+ * Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " mov $0x12345678, %eax \n"
+ " cmpl -11(%r11), %eax \n"
+ " cs lea -0x10(%r11), %r11 \n"
+ "#fineibt_caller_size: \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " cs call *%r11 \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
{
u8 *p = addr;
- /* b8 78 56 34 12 mov $0x12345678,%eax */
- if (p[0] == 0xb8)
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
return *(u32 *)(addr + 1);
+ }
return 0; /* invalid hash value */
}
@@ -962,11 +1441,11 @@ static u32 decode_caller_hash(void *addr)
{
u8 *p = addr;
- /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
if (p[0] == 0x41 && p[1] == 0xba)
return -*(u32 *)(addr + 2);
- /* e8 0c 78 56 34 12 jmp.d8 +12 */
+ /* e8 0c 88 a9 cb ed jmp.d8 +12 */
if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
return -*(u32 *)(addr + 2);
@@ -1031,7 +1510,7 @@ static int cfi_rand_preamble(s32 *start, s32 *end)
void *addr = (void *)s + *s;
u32 hash;
- hash = decode_preamble_hash(addr);
+ hash = decode_preamble_hash(addr, NULL);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
@@ -1043,15 +1522,87 @@ static int cfi_rand_preamble(s32 *start, s32 *end)
return 0;
}
+/*
+ * Inline the bhi-arity 1 case:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 49 0f 45 fa cmovne %rax, %rdi
+ * d: 2e 75 03 jne,pn foo+0x3
+ *
+ * foo:
+ * 10: 0f 1f 40 <d6> nopl -42(%rax)
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_bhi1_start: \n"
+ " cmovne %rax, %rdi \n"
+ " cs jne fineibt_bhi1_func + 0x3 \n"
+ "fineibt_bhi1_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_bhi1_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_bhi1_start[];
+extern u8 fineibt_bhi1_end[];
+
+#define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start)
+
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ u8 bytes[MAX_INSN_SIZE];
+
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ text_poke_early(addr + fineibt_preamble_bhi,
+ fineibt_bhi1_start, fineibt_bhi1_size);
+ return;
+ }
+
+ /*
+ * Replace the bytes at fineibt_preamble_bhi with a CALL instruction
+ * that lines up exactly with the end of the preamble, such that the
+ * return address will be foo+0.
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity]
+ */
+ bytes[0] = 0x2e;
+ bytes[1] = 0x2e;
+ __text_gen_insn(bytes + 2, CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi + 2,
+ __bhi_args[arity], CALL_INSN_SIZE);
+
+ text_poke_early(addr + fineibt_preamble_bhi, bytes, 7);
+}
+
static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ int arity;
u32 hash;
- hash = decode_preamble_hash(addr);
+ /*
+ * When the function doesn't start with ENDBR the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need no CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
@@ -1059,6 +1610,13 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
}
return 0;
@@ -1071,7 +1629,10 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- poison_endbr(addr+16, false);
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
}
}
@@ -1095,40 +1656,101 @@ static int cfi_rand_callers(s32 *start, s32 *end)
return 0;
}
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+ u8 *tmp = its_allocate_thunk(reg);
+ if (tmp)
+ thunk = tmp;
+#endif
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
static int cfi_rewrite_callers(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ struct insn insn;
+ u8 bytes[20];
u32 hash;
+ int ret;
+ u8 op;
addr -= fineibt_caller_size;
hash = decode_caller_hash(addr);
- if (hash) {
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
}
- /* rely on apply_retpolines() */
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+ emit_paranoid_trampoline(addr + fineibt_caller_size,
+ &insn, 11, bytes + fineibt_caller_size);
+ } else {
+ int len = fineibt_paranoid_size - fineibt_paranoid_ind;
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len);
+ if (WARN_ON_ONCE(ret != len))
+ continue;
+ }
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
}
return 0;
}
+#define pr_cfi_debug(X...) if (cfi_debug) pr_info(X)
+
+#define FINEIBT_WARN(_f, _v) \
+ WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v)
+
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
s32 *start_cfi, s32 *end_cfi, bool builtin)
{
int ret;
- if (WARN_ONCE(fineibt_preamble_size != 16,
- "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
+ if (FINEIBT_WARN(fineibt_preamble_size, 20) ||
+ FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) ||
+ FINEIBT_WARN(fineibt_caller_size, 14) ||
+ FINEIBT_WARN(fineibt_paranoid_size, 20))
return;
- if (cfi_mode == CFI_DEFAULT) {
+ if (cfi_mode == CFI_AUTO) {
cfi_mode = CFI_KCFI;
- if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
cfi_mode = CFI_FINEIBT;
+ }
}
/*
@@ -1136,54 +1758,71 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
* rewrite them. This disables all CFI. If this succeeds but any of the
* later stages fails, we're without CFI.
*/
+ pr_cfi_debug("CFI: disabling all indirect call checking\n");
ret = cfi_disable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
+ pr_cfi_debug("CFI: cfi_seed: 0x%08x\n", cfi_seed);
+ pr_cfi_debug("CFI: rehashing all preambles\n");
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
goto err;
+ pr_cfi_debug("CFI: rehashing all indirect calls\n");
ret = cfi_rand_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
+ } else {
+ pr_cfi_debug("CFI: rehashing disabled\n");
}
switch (cfi_mode) {
case CFI_OFF:
if (builtin)
- pr_info("Disabling CFI\n");
+ pr_info("CFI: disabled\n");
return;
case CFI_KCFI:
+ pr_cfi_debug("CFI: re-enabling all indirect call checking\n");
ret = cfi_enable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
if (builtin)
- pr_info("Using kCFI\n");
+ pr_info("CFI: Using %sretpoline kCFI\n",
+ cfi_rand ? "rehashed " : "");
return;
case CFI_FINEIBT:
+ pr_cfi_debug("CFI: adding FineIBT to all preambles\n");
/* place the FineIBT preamble at func()-16 */
ret = cfi_rewrite_preamble(start_cfi, end_cfi);
if (ret)
goto err;
/* rewrite the callers to target func()-16 */
+ pr_cfi_debug("CFI: rewriting indirect call sites to use FineIBT\n");
ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
/* now that nobody targets func()+0, remove ENDBR there */
+ pr_cfi_debug("CFI: removing old endbr insns\n");
cfi_rewrite_endbr(start_cfi, end_cfi);
- if (builtin)
- pr_info("Using FineIBT CFI\n");
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
return;
default:
@@ -1201,22 +1840,42 @@ static inline void poison_hash(void *addr)
static void poison_cfi(void *addr)
{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
switch (cfi_mode) {
case CFI_FINEIBT:
/*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
* __cfi_\func:
- * osp nopl (%rax)
- * subl $0, %r10d
- * jz 1f
- * ud2
- * 1: nop
+ * nopl -42(%rax)
+ * sub $0, %eax
+ * jne \func+3
+ * \func:
+ * nopl -42(%rax)
*/
- poison_endbr(addr, false);
+ poison_endbr(addr);
poison_hash(addr + fineibt_preamble_hash);
break;
case CFI_KCFI:
/*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
* __cfi_\func:
* movl $0, %eax
* .skip 11, 0x90
@@ -1229,18 +1888,168 @@ static void poison_cfi(void *addr)
}
}
-#else
+#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
+
+/*
+ * When regs->ip points to a 0xD6 byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We check the preamble by checking for the ENDBR instruction relative to the
+ * UDB instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_prefix_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack, this points to the
+ * FineIBT preamble. Since the CALL instruction is in the 5 last
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_prefix_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+static bool is_paranoid_thunk(unsigned long addr)
+{
+ u32 thunk;
+
+ __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+ return (thunk & 0x00FFFFFF) == 0xfd75d6;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+
+ if (!cfi_paranoid)
+ return false;
+
+ if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+ }
+
+ /*
+ * The cfi_paranoid + ITS thunk combination results in:
+ *
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f7 cmp -11(%r11), %eax
+ * a: 2e 3d 8d 5b f0 cs lea -0x10(%r11), %r11
+ * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
+ *
+ * Where the paranoid_thunk looks like:
+ *
+ * 1d: <d6> udb
+ * __x86_indirect_paranoid_thunk_r11:
+ * 1e: 75 fd jne 1d
+ * __x86_indirect_its_thunk_r11:
+ * 20: 41 ff eb jmp *%r11
+ * 23: cc int3
+ *
+ */
+ if (is_paranoid_thunk(regs->ip)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ regs->ip = *target;
+ return true;
+ }
+
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
s32 *start_cfi, s32 *end_cfi, bool builtin)
{
+ if (IS_ENABLED(CONFIG_CFI) && builtin)
+ pr_info("CFI: Using standard kCFI\n");
}
#ifdef CONFIG_X86_KERNEL_IBT
static void poison_cfi(void *addr) { }
#endif
-#endif
+#endif /* !CONFIG_FINEIBT */
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
s32 *start_cfi, s32 *end_cfi)
@@ -1403,46 +2212,6 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
-{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
- }
-}
-
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{
- struct paravirt_patch_site *p;
- char insn_buff[MAX_PATCH_LEN];
-
- for (p = start; p < end; p++) {
- unsigned int used;
-
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insn_buff, p->instr, p->len);
- used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
-
- BUG_ON(used > p->len);
-
- /* Pad the rest with nops */
- add_nops(insn_buff + used, p->len - used);
- text_poke_early(p->instr, insn_buff, p->len);
- }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
/*
* Self-test for the INT3 based CALL emulation code.
*
@@ -1454,21 +2223,34 @@ extern struct paravirt_patch_site __start_parainstructions[],
* See entry_{32,64}.S for more details.
*/
-/*
- * We define the int3_magic() function in assembly to control the calling
- * convention such that we can 'call' it from assembly.
- */
+extern void int3_selftest_asm(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR "\n"
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
-extern void int3_magic(unsigned int *ptr); /* defined in asm */
+extern void int3_selftest_callee(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
-" .type int3_magic, @function\n"
-"int3_magic:\n"
- ANNOTATE_NOENDBR
-" movl $1, (%" _ASM_ARG1 ")\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
+ ANNOTATE_NOENDBR "\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
-" .size int3_magic, .-int3_magic\n"
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
" .popsection\n"
);
@@ -1477,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- unsigned long selftest = (unsigned long)&int3_selftest_ip;
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
@@ -1492,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data
if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
- int3_emulate_call(regs, (unsigned long)&int3_magic);
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
return NOTIFY_STOP;
}
@@ -1508,19 +2290,11 @@ static noinline void __init int3_selftest(void)
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
- * Basically: int3_magic(&val); but really complicated :-)
- *
- * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
- * notifier above will emulate CALL for us.
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
*/
- asm volatile ("int3_selftest_ip:\n\t"
- ANNOTATE_NOENDBR
- " int3; nop; nop; nop; nop\n\t"
- : ASM_CALL_CONSTRAINT
- : __ASM_SEL_RAW(a, D) (&val)
- : "memory");
+ int3_selftest_asm(&val);
- BUG_ON(val != 1);
+ BUG_ON(val != 0x1234);
unregister_die_notifier(&int3_exception_nb);
}
@@ -1536,7 +2310,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg)
static noinline void __init alt_reloc_selftest(void)
{
/*
- * Tests apply_relocation().
+ * Tests text_poke_apply_relocation().
*
* This has a relative immediate (CALL) in a place other than the first
* instruction and additionally on x86_64 we get a RIP-relative LEA:
@@ -1549,7 +2323,7 @@ static noinline void __init alt_reloc_selftest(void)
*/
asm_inline volatile (
ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
- : /* output */
+ : ASM_CALL_CONSTRAINT
: [mem] "m" (__alt_reloc_selftest_addr)
: _ASM_ARG1
);
@@ -1557,6 +2331,8 @@ static noinline void __init alt_reloc_selftest(void)
void __init alternative_instructions(void)
{
+ u64 ibt;
+
int3_selftest();
/*
@@ -1578,30 +2354,17 @@ void __init alternative_instructions(void)
*/
/*
- * Paravirt patching and alternative patching can be combined to
- * replace a function call with a short direct code sequence (e.g.
- * by setting a constant return value instead of doing that in an
- * external function).
- * In order to make this work the following sequence is required:
- * 1. set (artificial) features depending on used paravirt
- * functions which can later influence alternative patching
- * 2. apply paravirt patching (generally replacing an indirect
- * function call with a direct one)
- * 3. apply alternative patching (e.g. replacing a direct function
- * call with a custom code sequence)
- * Doing paravirt patching after alternative patching would clobber
- * the optimization of the custom code with a function call again.
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
*/
paravirt_set_cap();
- /*
- * First patch paravirt functions, such that we overwrite the indirect
- * call with the direct call.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
__cfi_sites, __cfi_sites_end, true);
+ cfi_debug = false;
/*
* Rewrite the retpolines, must be done before alternatives since
@@ -1610,23 +2373,23 @@ void __init alternative_instructions(void)
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
- /*
- * Then patch alternatives, such that those paravirt calls that are in
- * alternatives can be overwritten by their immediate fragments.
- */
- apply_alternatives(__alt_instructions, __alt_instructions_end);
+ its_fini_core();
/*
- * Now all calls are established. Apply the call thunks if
- * required.
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
*/
callthunks_patch_builtin_calls();
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
/*
* Seal all functions that do not have their address taken.
*/
apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ ibt_restore(ibt);
+
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
@@ -1677,8 +2440,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
@@ -1687,83 +2450,33 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
-typedef struct {
- struct mm_struct *mm;
-} temp_mm_state_t;
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
/*
- * Using a temporary mm allows to set temporary mappings that are not accessible
- * by other CPUs. Such mappings are needed to perform sensitive memory writes
- * that override the kernel memory protections (e.g., W^X), without exposing the
- * temporary page-table mappings that are required for these write operations to
- * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
- * mapping is torn down.
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
*
- * Context: The temporary mm needs to be used exclusively by a single core. To
- * harden security IRQs must be disabled while the temporary mm is
- * loaded, thereby preventing interrupt handler bugs from overriding
- * the kernel memory protection.
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
*/
-static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
-{
- temp_mm_state_t temp_state;
-
- lockdep_assert_irqs_disabled();
-
- /*
- * Make sure not to be in TLB lazy mode, as otherwise we'll end up
- * with a stale address space WITHOUT being in lazy mode after
- * restoring the previous mm.
- */
- if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm(smp_processor_id());
-
- temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
- switch_mm_irqs_off(NULL, mm, current);
-
- /*
- * If breakpoints are enabled, disable them while the temporary mm is
- * used. Userspace might set up watchpoints on addresses that are used
- * in the temporary mm, which would lead to wrong signals being sent or
- * crashes.
- *
- * Note that breakpoints are not disabled selectively, which also causes
- * kernel breakpoints (e.g., perf's) to be disabled. This might be
- * undesirable, but still seems reasonable as the code that runs in the
- * temporary mm should be short.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_disable();
-
- return temp_state;
-}
-
-static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
-{
- lockdep_assert_irqs_disabled();
- switch_mm_irqs_off(NULL, prev_state.mm, current);
-
- /*
- * Restore the breakpoints if they were disabled before the temporary mm
- * was loaded.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_restore();
-}
-
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
- memcpy(dst, src, len);
+ lass_stac();
+ __inline_memcpy(dst, src, len);
+ lass_clac();
}
static void text_poke_memset(void *dst, const void *src, size_t len)
{
int c = *(const int *)src;
- memset(dst, c, len);
+ lass_stac();
+ __inline_memset(dst, c, len);
+ lass_clac();
}
typedef void text_poke_f(void *dst, const void *src, size_t len);
@@ -1772,7 +2485,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
- temp_mm_state_t prev;
+ struct mm_struct *prev_mm;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
@@ -1809,7 +2522,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
/*
* The lock is not really needed, but this allows to avoid open-coding.
*/
- ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
@@ -1819,21 +2532,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
local_irq_save(flags);
pte = mk_pte(pages[0], pgprot);
- set_pte_at(poking_mm, poking_addr, ptep, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
- set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
- prev = use_temporary_mm(poking_mm);
+ prev_mm = use_temporary_mm(text_poke_mm);
kasan_disable_current();
- func((u8 *)poking_addr + offset_in_page(addr), src, len);
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@@ -1842,22 +2555,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
*/
barrier();
- pte_clear(poking_mm, poking_addr, ptep);
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
if (cross_page_boundary)
- pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
- unuse_temporary_mm(prev);
+ unuse_temporary_mm(prev_mm);
/*
* Flushing the TLB might involve IPIs, which would require enabled
* IRQs, but not if the mm is not used, as it is in this point.
*/
- flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
@@ -1888,7 +2601,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
- * trough a mutex.
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
@@ -1993,7 +2706,7 @@ static void do_sync_core(void *info)
sync_core();
}
-void text_poke_sync(void)
+void smp_text_poke_sync_each_cpu(void)
{
on_each_cpu(do_sync_core, NULL, 1);
}
@@ -2003,64 +2716,66 @@ void text_poke_sync(void)
* this thing. When len == 6 everything is prefixed with 0x0f and we map
* opcode to Jcc.d8, using len to distinguish.
*/
-struct text_poke_loc {
+struct smp_text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
s32 disp;
u8 len;
u8 opcode;
- const u8 text[POKE_MAX_OPCODE_SIZE];
- /* see text_poke_bp_batch() */
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
u8 old;
};
-struct bp_patching_desc {
- struct text_poke_loc *vec;
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
int nr_entries;
- atomic_t refs;
-};
+} text_poke_array;
-static struct bp_patching_desc bp_desc;
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
-static __always_inline
-struct bp_patching_desc *try_get_desc(void)
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
- if (!raw_atomic_inc_not_zero(&desc->refs))
- return NULL;
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
- return desc;
+ return true;
}
-static __always_inline void put_desc(void)
+static __always_inline void put_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
smp_mb__before_atomic();
- raw_atomic_dec(&desc->refs);
+ raw_atomic_dec(refs);
}
-static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
{
- return _stext + tp->rel_addr;
+ return _stext + tpl->rel_addr;
}
-static __always_inline int patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
{
- struct text_poke_loc *tp = (struct text_poke_loc *) elt;
-
- if (key < text_poke_addr(tp))
+ if (tpl_a < text_poke_addr(tpl_b))
return -1;
- if (key > text_poke_addr(tp))
+ if (tpl_a > text_poke_addr(tpl_b))
return 1;
return 0;
}
-noinstr int poke_int3_handler(struct pt_regs *regs)
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
- struct bp_patching_desc *desc;
- struct text_poke_loc *tp;
+ struct smp_text_poke_loc *tpl;
int ret = 0;
void *ip;
@@ -2069,41 +2784,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
/*
* Having observed our INT3 instruction, we now must observe
- * bp_desc with non-zero refcount:
+ * text_poke_array with non-zero refcount:
*
- * bp_desc.refs = 1 INT3
- * WMB RMB
- * write INT3 if (bp_desc.refs != 0)
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
*/
smp_rmb();
- desc = try_get_desc();
- if (!desc)
+ if (!try_get_text_poke_array())
return 0;
/*
- * Discount the INT3. See text_poke_bp_batch().
+ * Discount the INT3. See smp_text_poke_batch_finish().
*/
ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(desc->nr_entries > 1)) {
- tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
- sizeof(struct text_poke_loc),
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
patch_cmp);
- if (!tp)
+ if (!tpl)
goto out_put;
} else {
- tp = desc->vec;
- if (text_poke_addr(tp) != ip)
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
goto out_put;
}
- ip += tp->len;
+ ip += tpl->len;
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
/*
* Someone poked an explicit INT3, they'll want to handle it,
@@ -2116,16 +2830,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->disp);
+ int3_emulate_call(regs, (long)ip + tpl->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->disp);
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
break;
case 0x70 ... 0x7f: /* Jcc */
- int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
break;
default:
@@ -2135,51 +2849,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
ret = 1;
out_put:
- put_desc();
+ put_text_poke_array();
return ret;
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
-
/**
- * text_poke_bp_batch() -- update instructions on live kernel on SMP
- * @tp: vector of instructions to patch
- * @nr_entries: number of entries in the vector
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
+ *
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
*
- * Modify multi-byte instruction by using int3 breakpoint on SMP.
- * We completely avoid stop_machine() here, and achieve the
- * synchronization using int3 breakpoint.
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
*
* The way it is done:
* - For each entry in the vector:
- * - add a int3 trap to the address that will be patched
- * - sync cores
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
* - For each entry in the vector:
* - update all but the first byte of the patched range
- * - sync cores
+ * - SMP sync all CPUs
* - For each entry in the vector:
- * - replace the first byte (int3) by the first byte of
+ * - replace the first byte (INT3) by the first byte of the
* replacing opcode
- * - sync cores
+ * - SMP sync all CPUs
*/
-static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+void smp_text_poke_batch_finish(void)
{
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
- lockdep_assert_held(&text_mutex);
+ if (!text_poke_array.nr_entries)
+ return;
- bp_desc.vec = tp;
- bp_desc.nr_entries = nr_entries;
+ lockdep_assert_held(&text_mutex);
/*
- * Corresponds to the implicit memory barrier in try_get_desc() to
- * ensure reading a non-zero refcount provides up to date bp_desc data.
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
*/
- atomic_set_release(&bp_desc.refs, 1);
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@@ -2192,33 +2905,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
cond_resched();
/*
- * Corresponding read barrier in int3 notifier for making sure the
- * nr_entries and handler are correctly ordered wrt. patching.
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
/*
- * First step: add a int3 trap to the address that will be patched.
+ * First step: add a INT3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++) {
- tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
- text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
}
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Second step: update all but the first byte of the patched range.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
- u8 _new[POKE_MAX_OPCODE_SIZE+1];
- const u8 *new = tp[i].text;
- int len = tp[i].len;
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
- text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
if (len == 6) {
@@ -2227,7 +2940,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
new = _new;
}
- text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
@@ -2258,7 +2971,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
- perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
}
if (do_sync) {
@@ -2267,63 +2980,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
}
/*
- * Third step: replace the first byte (int3) by the first byte of
+ * Third step: replace the first byte (INT3) by the first byte of the
* replacing opcode.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 byte = tp[i].text[0];
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
- if (tp[i].len == 6)
+ if (text_poke_array.vec[i].len == 6)
byte = 0x0f;
if (byte == INT3_INSN_OPCODE)
continue;
- text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
- if (!atomic_dec_and_test(&bp_desc.refs))
- atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
}
-static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
- const void *opcode, size_t len, const void *emulate)
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
+ struct smp_text_poke_loc *tpl;
struct insn insn;
int ret, i = 0;
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
if (len == 6)
i = 1;
- memcpy((void *)tp->text, opcode+i, len-i);
+ memcpy((void *)tpl->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
BUG_ON(ret < 0);
- tp->rel_addr = addr - (void *)_stext;
- tp->len = len;
- tp->opcode = insn.opcode.bytes[0];
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
if (is_jcc32(&insn)) {
/*
* Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
*/
- tp->opcode = insn.opcode.bytes[1] - 0x10;
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
@@ -2332,14 +3061,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* next instruction can be padded with INT3.
*/
for (i = insn.length; i < len; i++)
- BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
break;
default:
BUG_ON(len != insn.length);
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -2348,21 +3077,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
case 0x70 ... 0x7f: /* Jcc */
- tp->disp = insn.immediate.value;
+ tpl->disp = insn.immediate.value;
break;
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP8_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP32_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
break;
default: /* unknown instruction */
@@ -2373,51 +3102,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
/*
- * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
* early if needed.
*/
-static bool tp_order_fail(void *addr)
+static bool text_poke_addr_ordered(void *addr)
{
- struct text_poke_loc *tp;
+ WARN_ON_ONCE(!addr);
- if (!tp_vec_nr)
- return false;
-
- if (!addr) /* force */
+ if (!text_poke_array.nr_entries)
return true;
- tp = &tp_vec[tp_vec_nr - 1];
- if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
- return true;
-
- return false;
-}
-
-static void text_poke_flush(void *addr)
-{
- if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
- text_poke_bp_batch(tp_vec, tp_vec_nr);
- tp_vec_nr = 0;
- }
-}
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
-void text_poke_finish(void)
-{
- text_poke_flush(NULL);
+ return true;
}
-void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc *tp;
-
- text_poke_flush(addr);
-
- tp = &tp_vec[tp_vec_nr++];
- text_poke_loc_init(tp, addr, opcode, len, emulate);
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
}
/**
- * text_poke_bp() -- update instructions on live kernel on SMP
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
@@ -2425,12 +3153,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
- * not possible to allocate memory.
+ * not possible to allocate memory for a vector. The single instruction
+ * is patched in immediately.
*/
-void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp;
-
- text_poke_loc_init(&tp, addr, opcode, len, emulate);
- text_poke_bp_batch(&tp, 1);
+ smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
}