From 4e79f0211b473f8e1eab8211a9fd50cc41a3a061 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 21 Sep 2020 00:10:16 +0200 Subject: ARM: p2v: fix handling of LPAE translation in BE mode When running in BE mode on LPAE hardware with a PA-to-VA translation that exceeds 4 GB, we patch bits 39:32 of the offset into the wrong byte of the opcode. So fix that, by rotating the offset in r0 to the right by 8 bits, which will put the 8-bit immediate in bits 31:24. Note that this will also move bit #22 in its correct place when applying the rotation to the constant #0x400000. Fixes: d9a790df8e984 ("ARM: 7883/1: fix mov to mvn conversion in case of 64 bit phys_addr_t and BE") Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/kernel/head.S | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index f8904227e7fd..98c1e68bdfcb 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -671,12 +671,8 @@ ARM_BE8(rev16 ip, ip) ldrcc r7, [r4], #4 @ use branch for delay slot bcc 1b bx lr -#else -#ifdef CONFIG_CPU_ENDIAN_BE8 - moveq r0, #0x00004000 @ set bit 22, mov to mvn instruction #else moveq r0, #0x400000 @ set bit 22, mov to mvn instruction -#endif b 2f 1: ldr ip, [r7, r3] #ifdef CONFIG_CPU_ENDIAN_BE8 @@ -685,7 +681,7 @@ ARM_BE8(rev16 ip, ip) tst ip, #0x000f0000 @ check the rotation field orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 biceq ip, ip, #0x00004000 @ clear bit 22 - orreq ip, ip, r0 @ mask in offset bits 7-0 + orreq ip, ip, r0, ror #8 @ mask in offset bits 7-0 #else bic ip, ip, #0x000000ff tst ip, #0xf00 @ check the rotation field -- cgit From 0b1674638a5c69cbace63278625c199100955490 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 14 Sep 2020 11:23:39 +0300 Subject: ARM: assembler: introduce adr_l, ldr_l and str_l macros Like arm64, ARM supports position independent code sequences that produce symbol references with a greater reach than the ordinary adr/ldr instructions. Since on ARM, the adrl pseudo-instruction is only supported in ARM mode (and not at all when using Clang), having a adr_l macro like we do on arm64 is useful, and increases symmetry as well. Currently, we use open coded instruction sequences involving literals and arithmetic operations. Instead, we can use movw/movt pairs on v7 CPUs, circumventing the D-cache entirely. E.g., on v7+ CPUs, we can emit a PC-relative reference as follows: movw , #:lower16: - (1f + 8) movt , #:upper16: - (1f + 8) 1: add , , pc For older CPUs, we can emit the literal into a subsection, allowing it to be emitted out of line while retaining the ability to perform arithmetic on label offsets. E.g., on pre-v7 CPUs, we can emit a PC-relative reference as follows: ldr , 2f 1: add , , pc .subsection 1 2: .long - (1b + 8) .previous This is allowed by the assembler because, unlike ordinary sections, subsections are combined into a single section in the object file, and so the label references are not true cross-section references that are visible as relocations. (Subsections have been available in binutils since 2004 at least, so they should not cause any issues with older toolchains.) So use the above to implement the macros mov_l, adr_l, ldr_l and str_l, all of which will use movw/movt pairs on v7 and later CPUs, and use PC-relative literals otherwise. 
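
Both variants above resolve to the same arithmetic: the assembler records the distance from the symbol to the instruction that will add the program counter back (including the 8-byte PC bias of ARM mode), and the movw/movt pair simply carries that 32-bit displacement in two halves. A minimal host-side sketch of the calculation follows; the addresses and the PC_BIAS constant are illustrative only and are not taken from the patch.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PC_BIAS 8u                     /* ARM mode: pc reads as '.' + 8 */

  int main(void)
  {
          uint32_t sym   = 0xc0a01234;   /* address of the referenced symbol   */
          uint32_t label = 0xc0008010;   /* address of the "1: add rd, rd, pc" */

          /* link time: displacement recorded by the movw/movt (or the literal) */
          uint32_t offset  = sym - (label + PC_BIAS);
          uint16_t lower16 = offset & 0xffff;     /* #:lower16: -> movw */
          uint16_t upper16 = offset >> 16;        /* #:upper16: -> movt */

          /* run time: rebuild the displacement and add the current pc back */
          uint32_t reg = ((uint32_t)upper16 << 16) | lower16;
          reg += label + PC_BIAS;                 /* add rd, rd, pc */

          assert(reg == sym);
          printf("resolved to 0x%08x\n", (unsigned int)reg);
          return 0;
  }
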
Reviewed-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/assembler.h | 84 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index feac2c8b86f2..72627c5fb3b2 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -494,4 +494,88 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) #define _ASM_NOKPROBE(entry) #endif + .macro __adldst_l, op, reg, sym, tmp, c + .if __LINUX_ARM_ARCH__ < 7 + ldr\c \tmp, .La\@ + .subsection 1 + .align 2 +.La\@: .long \sym - .Lpc\@ + .previous + .else + .ifnb \c + THUMB( ittt \c ) + .endif + movw\c \tmp, #:lower16:\sym - .Lpc\@ + movt\c \tmp, #:upper16:\sym - .Lpc\@ + .endif + +#ifndef CONFIG_THUMB2_KERNEL + .set .Lpc\@, . + 8 // PC bias + .ifc \op, add + add\c \reg, \tmp, pc + .else + \op\c \reg, [pc, \tmp] + .endif +#else +.Lb\@: add\c \tmp, \tmp, pc + /* + * In Thumb-2 builds, the PC bias depends on whether we are currently + * emitting into a .arm or a .thumb section. The size of the add opcode + * above will be 2 bytes when emitting in Thumb mode and 4 bytes when + * emitting in ARM mode, so let's use this to account for the bias. + */ + .set .Lpc\@, . + (. - .Lb\@) + + .ifnc \op, add + \op\c \reg, [\tmp] + .endif +#endif + .endm + + /* + * mov_l - move a constant value or [relocated] address into a register + */ + .macro mov_l, dst:req, imm:req + .if __LINUX_ARM_ARCH__ < 7 + ldr \dst, =\imm + .else + movw \dst, #:lower16:\imm + movt \dst, #:upper16:\imm + .endif + .endm + + /* + * adr_l - adr pseudo-op with unlimited range + * + * @dst: destination register + * @sym: name of the symbol + * @cond: conditional opcode suffix + */ + .macro adr_l, dst:req, sym:req, cond + __adldst_l add, \dst, \sym, \dst, \cond + .endm + + /* + * ldr_l - ldr pseudo-op with unlimited range + * + * @dst: destination register + * @sym: name of the symbol + * @cond: conditional opcode suffix + */ + .macro ldr_l, dst:req, sym:req, cond + __adldst_l ldr, \dst, \sym, \dst, \cond + .endm + + /* + * str_l - str pseudo-op with unlimited range + * + * @src: source register + * @sym: name of the symbol + * @tmp: mandatory scratch register + * @cond: conditional opcode suffix + */ + .macro str_l, src:req, sym:req, tmp:req, cond + __adldst_l str, \src, \sym, \tmp, \cond + .endm + #endif /* __ASM_ASSEMBLER_H__ */ -- cgit From 22f2d23098f7d34fc5142531cffb241d14611684 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 14 Sep 2020 11:24:37 +0300 Subject: ARM: module: add support for place relative relocations When using the new adr_l/ldr_l/str_l macros to refer to external symbols from modules, the linker may emit place relative ELF relocations that need to be fixed up by the module loader. So add support for these. 
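
The fixup for a place-relative word is simply "symbol plus addend minus place". The sketch below models R_ARM_REL32 on a host buffer; the helper name and addresses are invented for illustration, and the real loader changes appear in the diff that follows.

  #include <assert.h>
  #include <stdint.h>

  /*
   * Toy model of an R_ARM_REL32 fixup: the relocated word ends up holding
   * S + A - P (symbol + addend - place), so code that later adds the word's
   * own address back recovers the symbol address.
   */
  static void apply_rel32(uint32_t *word, uint32_t place, uint32_t sym)
  {
          *word += sym - place;          /* the addend is whatever was there */
  }

  int main(void)
  {
          uint32_t word  = 0;            /* implicit addend of zero             */
          uint32_t place = 0xbf000100;   /* where the word lives in the module  */
          uint32_t sym   = 0xc0123456;   /* the external symbol it refers to    */

          apply_rel32(&word, place, sym);
          assert(word + place == sym);   /* place-relative reference resolves   */
          return 0;
  }
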
Reviewed-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/elf.h | 5 +++++ arch/arm/kernel/module.c | 20 ++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index b078d992414b..0ac62a54b73c 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -51,6 +51,7 @@ typedef struct user_fp elf_fpregset_t; #define R_ARM_NONE 0 #define R_ARM_PC24 1 #define R_ARM_ABS32 2 +#define R_ARM_REL32 3 #define R_ARM_CALL 28 #define R_ARM_JUMP24 29 #define R_ARM_TARGET1 38 @@ -58,11 +59,15 @@ typedef struct user_fp elf_fpregset_t; #define R_ARM_PREL31 42 #define R_ARM_MOVW_ABS_NC 43 #define R_ARM_MOVT_ABS 44 +#define R_ARM_MOVW_PREL_NC 45 +#define R_ARM_MOVT_PREL 46 #define R_ARM_THM_CALL 10 #define R_ARM_THM_JUMP24 30 #define R_ARM_THM_MOVW_ABS_NC 47 #define R_ARM_THM_MOVT_ABS 48 +#define R_ARM_THM_MOVW_PREL_NC 49 +#define R_ARM_THM_MOVT_PREL 50 /* * These are used to set parameters in the core dumps. diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e15444b25ca0..beac45e89ba6 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -185,14 +185,24 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, *(u32 *)loc |= offset & 0x7fffffff; break; + case R_ARM_REL32: + *(u32 *)loc += sym->st_value - loc; + break; + case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: + case R_ARM_MOVW_PREL_NC: + case R_ARM_MOVT_PREL: offset = tmp = __mem_to_opcode_arm(*(u32 *)loc); offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); offset = (offset ^ 0x8000) - 0x8000; offset += sym->st_value; - if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) + if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_PREL || + ELF32_R_TYPE(rel->r_info) == R_ARM_MOVW_PREL_NC) + offset -= loc; + if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS || + ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_PREL) offset >>= 16; tmp &= 0xfff0f000; @@ -283,6 +293,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: + case R_ARM_THM_MOVW_PREL_NC: + case R_ARM_THM_MOVT_PREL: upper = __mem_to_opcode_thumb16(*(u16 *)loc); lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); @@ -302,7 +314,11 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, offset = (offset ^ 0x8000) - 0x8000; offset += sym->st_value; - if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) + if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_PREL || + ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVW_PREL_NC) + offset -= loc; + if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS || + ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_PREL) offset >>= 16; upper = (u16)((upper & 0xfbf0) | -- cgit From eae78e1a97201a81a851342ad9659b60f61a3951 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 17:51:55 +0200 Subject: ARM: p2v: move patching code to separate assembler source file Move the phys2virt patching code into a separate .S file before doing some work on it. 
Suggested-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/kernel/Makefile | 1 + arch/arm/kernel/head.S | 138 ---------------------------------------- arch/arm/kernel/phys2virt.S | 151 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 138 deletions(-) create mode 100644 arch/arm/kernel/phys2virt.S diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 89e5d864e923..9e465efcc8b6 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_PARAVIRT) += paravirt.o head-y := head$(MMUEXT).o obj-$(CONFIG_DEBUG_LL) += debug.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_ARM_PATCH_PHYS_VIRT) += phys2virt.o # This is executed very early using a temporary stack when no memory allocator # nor global data is available. Everything has to be allocated on the stack. diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 98c1e68bdfcb..7e3f36809011 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -586,142 +586,4 @@ ENTRY(fixup_smp) ldmfd sp!, {r4 - r6, pc} ENDPROC(fixup_smp) -#ifdef __ARMEB__ -#define LOW_OFFSET 0x4 -#define HIGH_OFFSET 0x0 -#else -#define LOW_OFFSET 0x0 -#define HIGH_OFFSET 0x4 -#endif - -#ifdef CONFIG_ARM_PATCH_PHYS_VIRT - -/* __fixup_pv_table - patch the stub instructions with the delta between - * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be 16MiB aligned and - * can be expressed by an immediate shifter operand. The stub instruction - * has a form of '(add|sub) rd, rn, #imm'. - */ - __HEAD -__fixup_pv_table: - adr r0, 1f - ldmia r0, {r3-r7} - mvn ip, #0 - subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET - add r4, r4, r3 @ adjust table start address - add r5, r5, r3 @ adjust table end address - add r6, r6, r3 @ adjust __pv_phys_pfn_offset address - add r7, r7, r3 @ adjust __pv_offset address - mov r0, r8, lsr #PAGE_SHIFT @ convert to PFN - str r0, [r6] @ save computed PHYS_OFFSET to __pv_phys_pfn_offset - strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits - mov r6, r3, lsr #24 @ constant for add/sub instructions - teq r3, r6, lsl #24 @ must be 16MiB aligned -THUMB( it ne @ cross section branch ) - bne __error - str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits - b __fixup_a_pv_table -ENDPROC(__fixup_pv_table) - - .align -1: .long . 
- .long __pv_table_begin - .long __pv_table_end -2: .long __pv_phys_pfn_offset - .long __pv_offset - - .text -__fixup_a_pv_table: - adr r0, 3f - ldr r6, [r0] - add r6, r6, r3 - ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word - ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word - mov r6, r6, lsr #24 - cmn r0, #1 -#ifdef CONFIG_THUMB2_KERNEL - moveq r0, #0x200000 @ set bit 21, mov to mvn instruction - lsls r6, #24 - beq 2f - clz r7, r6 - lsr r6, #24 - lsl r6, r7 - bic r6, #0x0080 - lsrs r7, #1 - orrcs r6, #0x0080 - orr r6, r6, r7, lsl #12 - orr r6, #0x4000 - b 2f -1: add r7, r3 - ldrh ip, [r7, #2] -ARM_BE8(rev16 ip, ip) - tst ip, #0x4000 - and ip, #0x8f00 - orrne ip, r6 @ mask in offset bits 31-24 - orreq ip, r0 @ mask in offset bits 7-0 -ARM_BE8(rev16 ip, ip) - strh ip, [r7, #2] - bne 2f - ldrh ip, [r7] -ARM_BE8(rev16 ip, ip) - bic ip, #0x20 - orr ip, ip, r0, lsr #16 -ARM_BE8(rev16 ip, ip) - strh ip, [r7] -2: cmp r4, r5 - ldrcc r7, [r4], #4 @ use branch for delay slot - bcc 1b - bx lr -#else - moveq r0, #0x400000 @ set bit 22, mov to mvn instruction - b 2f -1: ldr ip, [r7, r3] -#ifdef CONFIG_CPU_ENDIAN_BE8 - @ in BE8, we load data in BE, but instructions still in LE - bic ip, ip, #0xff000000 - tst ip, #0x000f0000 @ check the rotation field - orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 - biceq ip, ip, #0x00004000 @ clear bit 22 - orreq ip, ip, r0, ror #8 @ mask in offset bits 7-0 -#else - bic ip, ip, #0x000000ff - tst ip, #0xf00 @ check the rotation field - orrne ip, ip, r6 @ mask in offset bits 31-24 - biceq ip, ip, #0x400000 @ clear bit 22 - orreq ip, ip, r0 @ mask in offset bits 7-0 -#endif - str ip, [r7, r3] -2: cmp r4, r5 - ldrcc r7, [r4], #4 @ use branch for delay slot - bcc 1b - ret lr -#endif -ENDPROC(__fixup_a_pv_table) - - .align -3: .long __pv_offset - -ENTRY(fixup_pv_table) - stmfd sp!, {r4 - r7, lr} - mov r3, #0 @ no offset - mov r4, r0 @ r0 = table start - add r5, r0, r1 @ r1 = table size - bl __fixup_a_pv_table - ldmfd sp!, {r4 - r7, pc} -ENDPROC(fixup_pv_table) - - .data - .align 2 - .globl __pv_phys_pfn_offset - .type __pv_phys_pfn_offset, %object -__pv_phys_pfn_offset: - .word 0 - .size __pv_phys_pfn_offset, . -__pv_phys_pfn_offset - - .globl __pv_offset - .type __pv_offset, %object -__pv_offset: - .quad 0 - .size __pv_offset, . -__pv_offset -#endif - #include "head-common.S" diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S new file mode 100644 index 000000000000..7c17fbfeeedd --- /dev/null +++ b/arch/arm/kernel/phys2virt.S @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 1994-2002 Russell King + * Copyright (c) 2003 ARM Limited + * All Rights Reserved + */ + +#include +#include +#include +#include + +#ifdef __ARMEB__ +#define LOW_OFFSET 0x4 +#define HIGH_OFFSET 0x0 +#else +#define LOW_OFFSET 0x0 +#define HIGH_OFFSET 0x4 +#endif + +/* + * __fixup_pv_table - patch the stub instructions with the delta between + * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be + * 16MiB aligned. 
+ * + * Called from head.S, which expects the following registers to be preserved: + * r1 = machine no, r2 = atags or dtb, + * r8 = phys_offset, r9 = cpuid, r10 = procinfo + */ + __HEAD +ENTRY(__fixup_pv_table) + adr r0, 1f + ldmia r0, {r3-r7} + mvn ip, #0 + subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET + add r4, r4, r3 @ adjust table start address + add r5, r5, r3 @ adjust table end address + add r6, r6, r3 @ adjust __pv_phys_pfn_offset address + add r7, r7, r3 @ adjust __pv_offset address + mov r0, r8, lsr #PAGE_SHIFT @ convert to PFN + str r0, [r6] @ save computed PHYS_OFFSET to __pv_phys_pfn_offset + strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits + mov r6, r3, lsr #24 @ constant for add/sub instructions + teq r3, r6, lsl #24 @ must be 16MiB aligned + bne 0f + str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits + b __fixup_a_pv_table +0: mov r0, r0 @ deadloop on error + b 0b +ENDPROC(__fixup_pv_table) + + .align +1: .long . + .long __pv_table_begin + .long __pv_table_end +2: .long __pv_phys_pfn_offset + .long __pv_offset + + .text +__fixup_a_pv_table: + adr r0, 3f + ldr r6, [r0] + add r6, r6, r3 + ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word + ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word + mov r6, r6, lsr #24 + cmn r0, #1 +#ifdef CONFIG_THUMB2_KERNEL + moveq r0, #0x200000 @ set bit 21, mov to mvn instruction + lsls r6, #24 + beq 2f + clz r7, r6 + lsr r6, #24 + lsl r6, r7 + bic r6, #0x0080 + lsrs r7, #1 + orrcs r6, #0x0080 + orr r6, r6, r7, lsl #12 + orr r6, #0x4000 + b 2f +1: add r7, r3 + ldrh ip, [r7, #2] +ARM_BE8(rev16 ip, ip) + tst ip, #0x4000 + and ip, #0x8f00 + orrne ip, r6 @ mask in offset bits 31-24 + orreq ip, r0 @ mask in offset bits 7-0 +ARM_BE8(rev16 ip, ip) + strh ip, [r7, #2] + bne 2f + ldrh ip, [r7] +ARM_BE8(rev16 ip, ip) + bic ip, #0x20 + orr ip, ip, r0, lsr #16 +ARM_BE8(rev16 ip, ip) + strh ip, [r7] +2: cmp r4, r5 + ldrcc r7, [r4], #4 @ use branch for delay slot + bcc 1b + bx lr +#else + moveq r0, #0x400000 @ set bit 22, mov to mvn instruction + b 2f +1: ldr ip, [r7, r3] +#ifdef CONFIG_CPU_ENDIAN_BE8 + @ in BE8, we load data in BE, but instructions still in LE + bic ip, ip, #0xff000000 + tst ip, #0x000f0000 @ check the rotation field + orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 + biceq ip, ip, #0x00004000 @ clear bit 22 + orreq ip, ip, r0, ror #8 @ mask in offset bits 7-0 +#else + bic ip, ip, #0x000000ff + tst ip, #0xf00 @ check the rotation field + orrne ip, ip, r6 @ mask in offset bits 31-24 + biceq ip, ip, #0x400000 @ clear bit 22 + orreq ip, ip, r0 @ mask in offset bits 7-0 +#endif + str ip, [r7, r3] +2: cmp r4, r5 + ldrcc r7, [r4], #4 @ use branch for delay slot + bcc 1b + ret lr +#endif +ENDPROC(__fixup_a_pv_table) + + .align +3: .long __pv_offset + +ENTRY(fixup_pv_table) + stmfd sp!, {r4 - r7, lr} + mov r3, #0 @ no offset + mov r4, r0 @ r0 = table start + add r5, r0, r1 @ r1 = table size + bl __fixup_a_pv_table + ldmfd sp!, {r4 - r7, pc} +ENDPROC(fixup_pv_table) + + .data + .align 2 + .globl __pv_phys_pfn_offset + .type __pv_phys_pfn_offset, %object +__pv_phys_pfn_offset: + .word 0 + .size __pv_phys_pfn_offset, . -__pv_phys_pfn_offset + + .globl __pv_offset + .type __pv_offset, %object +__pv_offset: + .quad 0 + .size __pv_offset, . 
-__pv_offset -- cgit From 4b16421c3e955f440eb45546db6ce33d47f29c78 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 17 Sep 2020 21:29:36 +0300 Subject: ARM: p2v: factor out shared loop processing The ARM and Thumb2 versions of the p2v patching loop have some overlap at the end of the loop, so factor that out. As numeric labels are not required to be unique, and may therefore be ambiguous, use named local labels for the start and end of the loop instead. Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/kernel/phys2virt.S | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index 7c17fbfeeedd..8fb1f7bcc720 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -68,7 +68,7 @@ __fixup_a_pv_table: #ifdef CONFIG_THUMB2_KERNEL moveq r0, #0x200000 @ set bit 21, mov to mvn instruction lsls r6, #24 - beq 2f + beq .Lnext clz r7, r6 lsr r6, #24 lsl r6, r7 @@ -77,8 +77,8 @@ __fixup_a_pv_table: orrcs r6, #0x0080 orr r6, r6, r7, lsl #12 orr r6, #0x4000 - b 2f -1: add r7, r3 + b .Lnext +.Lloop: add r7, r3 ldrh ip, [r7, #2] ARM_BE8(rev16 ip, ip) tst ip, #0x4000 @@ -87,21 +87,17 @@ ARM_BE8(rev16 ip, ip) orreq ip, r0 @ mask in offset bits 7-0 ARM_BE8(rev16 ip, ip) strh ip, [r7, #2] - bne 2f + bne .Lnext ldrh ip, [r7] ARM_BE8(rev16 ip, ip) bic ip, #0x20 orr ip, ip, r0, lsr #16 ARM_BE8(rev16 ip, ip) strh ip, [r7] -2: cmp r4, r5 - ldrcc r7, [r4], #4 @ use branch for delay slot - bcc 1b - bx lr #else moveq r0, #0x400000 @ set bit 22, mov to mvn instruction - b 2f -1: ldr ip, [r7, r3] + b .Lnext +.Lloop: ldr ip, [r7, r3] #ifdef CONFIG_CPU_ENDIAN_BE8 @ in BE8, we load data in BE, but instructions still in LE bic ip, ip, #0xff000000 @@ -117,11 +113,13 @@ ARM_BE8(rev16 ip, ip) orreq ip, ip, r0 @ mask in offset bits 7-0 #endif str ip, [r7, r3] -2: cmp r4, r5 +#endif + +.Lnext: + cmp r4, r5 ldrcc r7, [r4], #4 @ use branch for delay slot - bcc 1b + bcc .Lloop ret lr -#endif ENDPROC(__fixup_a_pv_table) .align -- cgit From 7a94849e81b5c10e71f0a555300313c2789d9b0d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 17 Sep 2020 21:36:46 +0300 Subject: ARM: p2v: factor out BE8 handling The big and little endian versions of the ARM p2v patching routine only differ in the values of the constants, so factor those out into macros so that we only have one version of the logic sequence to maintain. 
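
The reason only the constants differ is that in BE8 the instruction stream is still little-endian, but ldr/str move it as big-endian data, so every mask is just the byte-swapped version of its LE counterpart. A quick host-side check of the constants used in this hunk (illustrative only, not kernel code):

  #include <assert.h>
  #include <stdint.h>

  /* What loading a little-endian opcode as big-endian data does to it. */
  static uint32_t bswap32(uint32_t x)
  {
          return (x >> 24) | ((x >> 8) & 0xff00) |
                 ((x << 8) & 0xff0000) | (x << 24);
  }

  int main(void)
  {
          assert(bswap32(0x00400000) == 0x00004000);   /* bit 22 (mov -> mvn) */
          assert(bswap32(0x000000ff) == 0xff000000);   /* 8-bit immediate     */
          assert(bswap32(0x00000f00) == 0x000f0000);   /* rotation field      */
          return 0;
  }
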
Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/kernel/phys2virt.S | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index 8fb1f7bcc720..5031e5a2e78b 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -95,23 +95,25 @@ ARM_BE8(rev16 ip, ip) ARM_BE8(rev16 ip, ip) strh ip, [r7] #else - moveq r0, #0x400000 @ set bit 22, mov to mvn instruction - b .Lnext -.Lloop: ldr ip, [r7, r3] #ifdef CONFIG_CPU_ENDIAN_BE8 - @ in BE8, we load data in BE, but instructions still in LE - bic ip, ip, #0xff000000 - tst ip, #0x000f0000 @ check the rotation field - orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 - biceq ip, ip, #0x00004000 @ clear bit 22 - orreq ip, ip, r0, ror #8 @ mask in offset bits 7-0 +@ in BE8, we load data in BE, but instructions still in LE +#define PV_BIT22 0x00004000 +#define PV_IMM8_MASK 0xff000000 +#define PV_ROT_MASK 0x000f0000 #else - bic ip, ip, #0x000000ff - tst ip, #0xf00 @ check the rotation field - orrne ip, ip, r6 @ mask in offset bits 31-24 - biceq ip, ip, #0x400000 @ clear bit 22 - orreq ip, ip, r0 @ mask in offset bits 7-0 +#define PV_BIT22 0x00400000 +#define PV_IMM8_MASK 0x000000ff +#define PV_ROT_MASK 0xf00 #endif + + moveq r0, #0x400000 @ set bit 22, mov to mvn instruction + b .Lnext +.Lloop: ldr ip, [r7, r3] + bic ip, ip, #PV_IMM8_MASK + tst ip, #PV_ROT_MASK @ check the rotation field + orrne ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 + biceq ip, ip, #PV_BIT22 @ clear bit 22 + orreq ip, ip, r0 ARM_BE8(, ror #8) @ mask in offset bits 7-0 (or bit 22) str ip, [r7, r3] #endif -- cgit From 0869f3b9da38889faef2ccafcf675c713d4a3aa8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Sep 2020 10:00:24 +0300 Subject: ARM: p2v: drop redundant 'type' argument from __pv_stub We always pass the same value for 'type' so pull it into the __pv_stub macro itself. Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 99035b5891ef..eb3c8e6e960a 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -183,14 +183,14 @@ extern const void *__pv_table_begin, *__pv_table_end; #define PHYS_OFFSET ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT) #define PHYS_PFN_OFFSET (__pv_phys_pfn_offset) -#define __pv_stub(from,to,instr,type) \ +#define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ " .pushsection .pv_table,\"a\"\n" \ " .long 1b\n" \ " .popsection\n" \ : "=r" (to) \ - : "r" (from), "I" (type)) + : "r" (from), "I" (__PV_BITS_31_24)) #define __pv_stub_mov_hi(t) \ __asm__ volatile("@ __pv_stub_mov\n" \ @@ -217,7 +217,7 @@ static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) phys_addr_t t; if (sizeof(phys_addr_t) == 4) { - __pv_stub(x, t, "add", __PV_BITS_31_24); + __pv_stub(x, t, "add"); } else { __pv_stub_mov_hi(t); __pv_add_carry_stub(x, t); @@ -235,7 +235,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x) * assembler expression receives 32 bit argument * in place where 'r' 32 bit operand is expected. 
*/ - __pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24); + __pv_stub((unsigned long) x, t, "sub"); return t; } -- cgit From 2730e8eaa4f2baccc03296e0c5ee109c0673fe5f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 18:23:35 +0200 Subject: ARM: p2v: use relative references in patch site arrays Free up a register in the p2v patching code by switching to relative references, which don't require keeping the phys-to-virt displacement live in a register. Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 6 +++--- arch/arm/kernel/phys2virt.S | 18 +++++++----------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index eb3c8e6e960a..4121662dea5a 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -187,7 +187,7 @@ extern const void *__pv_table_begin, *__pv_table_end; __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "=r" (to) \ : "r" (from), "I" (__PV_BITS_31_24)) @@ -196,7 +196,7 @@ extern const void *__pv_table_begin, *__pv_table_end; __asm__ volatile("@ __pv_stub_mov\n" \ "1: mov %R0, %1\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "=r" (t) \ : "I" (__PV_BITS_7_0)) @@ -206,7 +206,7 @@ extern const void *__pv_table_begin, *__pv_table_end; "1: adds %Q0, %1, %2\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "+r" (y) \ : "r" (x), "I" (__PV_BITS_31_24) \ diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index 5031e5a2e78b..8e4be15e1559 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -58,9 +58,7 @@ ENDPROC(__fixup_pv_table) .text __fixup_a_pv_table: - adr r0, 3f - ldr r6, [r0] - add r6, r6, r3 + adr_l r6, __pv_offset ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word mov r6, r6, lsr #24 @@ -78,7 +76,8 @@ __fixup_a_pv_table: orr r6, r6, r7, lsl #12 orr r6, #0x4000 b .Lnext -.Lloop: add r7, r3 +.Lloop: add r7, r4 + adds r4, #4 ldrh ip, [r7, #2] ARM_BE8(rev16 ip, ip) tst ip, #0x4000 @@ -108,28 +107,25 @@ ARM_BE8(rev16 ip, ip) moveq r0, #0x400000 @ set bit 22, mov to mvn instruction b .Lnext -.Lloop: ldr ip, [r7, r3] +.Lloop: ldr ip, [r7, r4] bic ip, ip, #PV_IMM8_MASK tst ip, #PV_ROT_MASK @ check the rotation field orrne ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 biceq ip, ip, #PV_BIT22 @ clear bit 22 orreq ip, ip, r0 ARM_BE8(, ror #8) @ mask in offset bits 7-0 (or bit 22) - str ip, [r7, r3] + str ip, [r7, r4] + add r4, r4, #4 #endif .Lnext: cmp r4, r5 - ldrcc r7, [r4], #4 @ use branch for delay slot + ldrcc r7, [r4] @ use branch for delay slot bcc .Lloop ret lr ENDPROC(__fixup_a_pv_table) - .align -3: .long __pv_offset - ENTRY(fixup_pv_table) stmfd sp!, {r4 - r7, lr} - mov r3, #0 @ no offset mov r4, r0 @ r0 = table start add r5, r0, r1 @ r1 = table size bl __fixup_a_pv_table -- cgit From 0e3db6c9d7f6fd0ee263325027e8d3fdac5a4c9e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 19:19:32 +0200 Subject: ARM: p2v: simplify __fixup_pv_table() Declutter the code in __fixup_pv_table() by using the new adr_l/str_l macros to take PC relative references to external symbols, and by using the value of PHYS_OFFSET passed in r8 to calculate the p2v offset. 
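
For reference, __pv_offset is kept as a 64-bit quantity: the low word holds PHYS_OFFSET - PAGE_OFFSET, and the high word is written as all-ones (the strcc of the mvn'ed ip) only when that subtraction borrows, i.e. when the offset is negative. A small host-side model of that bookkeeping, with example addresses not taken from any real platform:

  #include <assert.h>
  #include <stdint.h>

  /* Model of __fixup_pv_table's bookkeeping; the addresses are examples. */
  static uint64_t make_pv_offset(uint32_t phys_offset, uint32_t page_offset)
  {
          uint32_t low  = phys_offset - page_offset;            /* subs r3, r8, #PAGE_OFFSET    */
          uint32_t high = phys_offset < page_offset ? ~0u : 0u; /* strcc ip, [r0, #HIGH_OFFSET] */

          return ((uint64_t)high << 32) | low;
  }

  int main(void)
  {
          /* RAM above PAGE_OFFSET: plain positive 32-bit offset */
          assert(make_pv_offset(0xc8000000, 0xc0000000) == 0x08000000);

          /* RAM below PAGE_OFFSET: __pv_offset becomes the sign-extended
             negative offset, which the LPAE adds/adc sequence relies on */
          assert(make_pv_offset(0x80000000, 0xc0000000) == 0xffffffffc0000000ull);
          return 0;
  }
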
Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/kernel/phys2virt.S | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index 8e4be15e1559..be8fb0d89877 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -29,33 +29,27 @@ */ __HEAD ENTRY(__fixup_pv_table) - adr r0, 1f - ldmia r0, {r3-r7} - mvn ip, #0 - subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET - add r4, r4, r3 @ adjust table start address - add r5, r5, r3 @ adjust table end address - add r6, r6, r3 @ adjust __pv_phys_pfn_offset address - add r7, r7, r3 @ adjust __pv_offset address mov r0, r8, lsr #PAGE_SHIFT @ convert to PFN - str r0, [r6] @ save computed PHYS_OFFSET to __pv_phys_pfn_offset - strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits - mov r6, r3, lsr #24 @ constant for add/sub instructions - teq r3, r6, lsl #24 @ must be 16MiB aligned + str_l r0, __pv_phys_pfn_offset, r3 + + adr_l r0, __pv_offset + subs r3, r8, #PAGE_OFFSET @ PHYS_OFFSET - PAGE_OFFSET + mvn ip, #0 + strcc ip, [r0, #HIGH_OFFSET] @ save to __pv_offset high bits + str r3, [r0, #LOW_OFFSET] @ save to __pv_offset low bits + + mov r0, r3, lsr #24 @ constant for add/sub instructions + teq r3, r0, lsl #24 @ must be 16MiB aligned bne 0f - str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits + + adr_l r4, __pv_table_begin + adr_l r5, __pv_table_end b __fixup_a_pv_table + 0: mov r0, r0 @ deadloop on error b 0b ENDPROC(__fixup_pv_table) - .align -1: .long . - .long __pv_table_begin - .long __pv_table_end -2: .long __pv_phys_pfn_offset - .long __pv_offset - .text __fixup_a_pv_table: adr_l r6, __pv_offset -- cgit From e8e00f5afb087912fb3edb225ee373aa6499bb79 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 23:02:25 +0200 Subject: ARM: p2v: switch to MOVW for Thumb2 and ARM/LPAE In preparation for reducing the phys-to-virt minimum relative alignment from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW instructions that can more easily be manipulated to carry a 12-bit immediate. Note that the non-LPAE ARM sequence is not updated: MOVW may not be supported on non-LPAE platforms, and the sequence itself can be updated more easily to apply the 12 bits of displacement. For Thumb2, which has many more versions of opcodes, switch to a sequence that can be patched by the same patching code for both versions. Note that the Thumb2 opcodes for MOVW and MVN are unambiguous, and have no rotation bits in their immediate fields, so there is no need to use placeholder constants in the asm blocks. While at it, drop the 'volatile' qualifiers from the asm blocks: the code does not have any side effects that are invisible to the compiler, so it is free to omit these sequences if the outputs are not used. 
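
As a sanity check of the new LPAE sequence, the following host-side model mimics the patched movw/adds/mov/adc chain for a positive, 16 MiB aligned offset. The function name and addresses are invented, and the negative-offset (MVN) case is only noted in the comment.

  #include <assert.h>
  #include <stdint.h>

  /*
   * Model of the patched LPAE virt-to-phys sequence:
   *     movw  rHI, #offset<31:24>         @ immediate patched in
   *     adds  rLO, rVIRT, rHI, lsl #24
   *     mov   rHI, #offset<39:32>         @ becomes mvn for negative offsets
   *     adc   rHI, rHI, #0
   */
  static uint64_t virt_to_phys_lpae(uint32_t virt, uint64_t p2v_offset)
  {
          uint32_t off_31_24 = (p2v_offset >> 24) & 0xff;   /* movw immediate */
          uint32_t off_39_32 = (p2v_offset >> 32) & 0xff;   /* mov immediate  */

          uint32_t lo    = virt + (off_31_24 << 24);        /* adds           */
          uint32_t carry = lo < virt;                       /* C flag         */
          uint32_t hi    = off_39_32 + carry;               /* mov + adc      */

          return ((uint64_t)hi << 32) | lo;
  }

  int main(void)
  {
          /* example: RAM at 0x2_3000_0000, PAGE_OFFSET at 0xc0000000 */
          uint64_t offset = 0x230000000ull - 0xc0000000ull;

          assert(virt_to_phys_lpae(0xc0000000, offset) == 0x230000000ull);
          assert(virt_to_phys_lpae(0xc1234000, offset) == 0x231234000ull);
          return 0;
  }
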
Suggested-by: Russell King Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 44 +++++++++---- arch/arm/kernel/phys2virt.S | 147 +++++++++++++++++++++++++++++++++--------- 2 files changed, 148 insertions(+), 43 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 4121662dea5a..ccf55cef6ab9 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -183,6 +183,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define PHYS_OFFSET ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT) #define PHYS_PFN_OFFSET (__pv_phys_pfn_offset) +#ifndef CONFIG_THUMB2_KERNEL #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ @@ -192,25 +193,45 @@ extern const void *__pv_table_begin, *__pv_table_end; : "=r" (to) \ : "r" (from), "I" (__PV_BITS_31_24)) -#define __pv_stub_mov_hi(t) \ - __asm__ volatile("@ __pv_stub_mov\n" \ - "1: mov %R0, %1\n" \ +#define __pv_add_carry_stub(x, y) \ + __asm__("@ __pv_add_carry_stub\n" \ + "0: movw %R0, #0\n" \ + " adds %Q0, %1, %R0, lsl #24\n" \ + "1: mov %R0, %2\n" \ + " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 0b - ., 1b - .\n" \ " .popsection\n" \ - : "=r" (t) \ - : "I" (__PV_BITS_7_0)) + : "=&r" (y) \ + : "r" (x), "I" (__PV_BITS_7_0) \ + : "cc") + +#else +#define __pv_stub(from,to,instr) \ + __asm__("@ __pv_stub\n" \ + "0: movw %0, #0\n" \ + " lsl %0, #24\n" \ + " " instr " %0, %1, %0\n" \ + " .pushsection .pv_table,\"a\"\n" \ + " .long 0b - .\n" \ + " .popsection\n" \ + : "=&r" (to) \ + : "r" (from)) #define __pv_add_carry_stub(x, y) \ - __asm__ volatile("@ __pv_add_carry_stub\n" \ - "1: adds %Q0, %1, %2\n" \ + __asm__("@ __pv_add_carry_stub\n" \ + "0: movw %R0, #0\n" \ + " lsls %R0, #24\n" \ + " adds %Q0, %1, %R0\n" \ + "1: mvn %R0, #0\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 0b - ., 1b - .\n" \ " .popsection\n" \ - : "+r" (y) \ - : "r" (x), "I" (__PV_BITS_31_24) \ + : "=&r" (y) \ + : "r" (x) \ : "cc") +#endif static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) { @@ -219,7 +240,6 @@ static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) if (sizeof(phys_addr_t) == 4) { __pv_stub(x, t, "add"); } else { - __pv_stub_mov_hi(t); __pv_add_carry_stub(x, t); } return t; diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index be8fb0d89877..a4e364689663 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 1994-2002 Russell King - * Copyright (c) 2003 ARM Limited + * Copyright (c) 2003, 2020 ARM Limited * All Rights Reserved */ @@ -58,55 +58,140 @@ __fixup_a_pv_table: mov r6, r6, lsr #24 cmn r0, #1 #ifdef CONFIG_THUMB2_KERNEL + @ + @ The Thumb-2 versions of the patchable sequences are + @ + @ phys-to-virt: movw , #offset<31:24> + @ lsl , #24 + @ sub , , + @ + @ virt-to-phys (non-LPAE): movw , #offset<31:24> + @ lsl , #24 + @ add , , + @ + @ virt-to-phys (LPAE): movw , #offset<31:24> + @ lsl , #24 + @ adds , , + @ mov , #offset<39:32> + @ adc , , #0 + @ + @ In the non-LPAE case, all patchable instructions are MOVW + @ instructions, where we need to patch in the offset into the + @ second halfword of the opcode (the 16-bit immediate is encoded + @ as imm4:i:imm3:imm8) + @ + @ 15 11 10 9 4 3 0 15 14 12 11 8 7 0 + @ 
+-----------+---+-------------+------++---+------+----+------+ + @ MOVW | 1 1 1 1 0 | i | 1 0 0 1 0 0 | imm4 || 0 | imm3 | Rd | imm8 | + @ +-----------+---+-------------+------++---+------+----+------+ + @ + @ In the LPAE case, we also need to patch in the high word of the + @ offset into the immediate field of the MOV instruction, or patch it + @ to a MVN instruction if the offset is negative. In this case, we + @ need to inspect the first halfword of the opcode, to check whether + @ it is MOVW or MOV/MVN, and to perform the MOV to MVN patching if + @ needed. The encoding of the immediate is rather complex for values + @ of i:imm3 != 0b0000, but fortunately, we never need more than 8 lower + @ order bits, which can be patched into imm8 directly (and i:imm3 + @ cleared) + @ + @ 15 11 10 9 5 0 15 14 12 11 8 7 0 + @ +-----------+---+---------------------++---+------+----+------+ + @ MOV | 1 1 1 1 0 | i | 0 0 0 1 0 0 1 1 1 1 || 0 | imm3 | Rd | imm8 | + @ MVN | 1 1 1 1 0 | i | 0 0 0 1 1 0 1 1 1 1 || 0 | imm3 | Rd | imm8 | + @ +-----------+---+---------------------++---+------+----+------+ + @ moveq r0, #0x200000 @ set bit 21, mov to mvn instruction - lsls r6, #24 - beq .Lnext - clz r7, r6 - lsr r6, #24 - lsl r6, r7 - bic r6, #0x0080 - lsrs r7, #1 - orrcs r6, #0x0080 - orr r6, r6, r7, lsl #12 - orr r6, #0x4000 b .Lnext .Lloop: add r7, r4 - adds r4, #4 - ldrh ip, [r7, #2] -ARM_BE8(rev16 ip, ip) - tst ip, #0x4000 - and ip, #0x8f00 - orrne ip, r6 @ mask in offset bits 31-24 - orreq ip, r0 @ mask in offset bits 7-0 -ARM_BE8(rev16 ip, ip) - strh ip, [r7, #2] - bne .Lnext + adds r4, #4 @ clears Z flag +#ifdef CONFIG_ARM_LPAE ldrh ip, [r7] ARM_BE8(rev16 ip, ip) - bic ip, #0x20 - orr ip, ip, r0, lsr #16 + tst ip, #0x200 @ MOVW has bit 9 set, MVN has it clear + bne 0f @ skip to MOVW handling (Z flag is clear) + bic ip, #0x20 @ clear bit 5 (MVN -> MOV) + orr ip, ip, r0, lsr #16 @ MOV -> MVN if offset < 0 ARM_BE8(rev16 ip, ip) strh ip, [r7] + @ Z flag is set +0: +#endif + ldrh ip, [r7, #2] +ARM_BE8(rev16 ip, ip) + and ip, #0xf00 @ clear everything except Rd field + orreq ip, r0 @ Z flag set -> MOV/MVN -> patch in high bits + orrne ip, r6 @ Z flag clear -> MOVW -> patch in low bits +ARM_BE8(rev16 ip, ip) + strh ip, [r7, #2] #else #ifdef CONFIG_CPU_ENDIAN_BE8 @ in BE8, we load data in BE, but instructions still in LE -#define PV_BIT22 0x00004000 +#define PV_BIT24 0x00000001 #define PV_IMM8_MASK 0xff000000 -#define PV_ROT_MASK 0x000f0000 #else -#define PV_BIT22 0x00400000 +#define PV_BIT24 0x01000000 #define PV_IMM8_MASK 0x000000ff -#define PV_ROT_MASK 0xf00 #endif + @ + @ The ARM versions of the patchable sequences are + @ + @ phys-to-virt: sub , , #offset<31:24>, lsl #24 + @ + @ virt-to-phys (non-LPAE): add , , #offset<31:24>, lsl #24 + @ + @ virt-to-phys (LPAE): movw , #offset<31:24> + @ adds , , , lsl #24 + @ mov , #offset<39:32> + @ adc , , #0 + @ + @ In the non-LPAE case, all patchable instructions are ADD or SUB + @ instructions, where we need to patch in the offset into the + @ immediate field of the opcode, which is emitted with the correct + @ rotation value. 
(The effective value of the immediate is imm12<7:0> + @ rotated right by [2 * imm12<11:8>] bits) + @ + @ 31 28 27 23 22 20 19 16 15 12 11 0 + @ +------+-----------------+------+------+-------+ + @ ADD | cond | 0 0 1 0 1 0 0 0 | Rn | Rd | imm12 | + @ SUB | cond | 0 0 1 0 0 1 0 0 | Rn | Rd | imm12 | + @ MOV | cond | 0 0 1 1 1 0 1 0 | Rn | Rd | imm12 | + @ MVN | cond | 0 0 1 1 1 1 1 0 | Rn | Rd | imm12 | + @ +------+-----------------+------+------+-------+ + @ + @ In the LPAE case, we use a MOVW instruction to carry the low offset + @ word, and patch in the high word of the offset into the immediate + @ field of the subsequent MOV instruction, or patch it to a MVN + @ instruction if the offset is negative. We can distinguish MOVW + @ instructions based on bits 23:22 of the opcode, and ADD/SUB can be + @ distinguished from MOV/MVN (all using the encodings above) using + @ bit 24. + @ + @ 31 28 27 23 22 20 19 16 15 12 11 0 + @ +------+-----------------+------+------+-------+ + @ MOVW | cond | 0 0 1 1 0 0 0 0 | imm4 | Rd | imm12 | + @ +------+-----------------+------+------+-------+ + @ moveq r0, #0x400000 @ set bit 22, mov to mvn instruction b .Lnext .Lloop: ldr ip, [r7, r4] +#ifdef CONFIG_ARM_LPAE + tst ip, #PV_BIT24 @ ADD/SUB have bit 24 clear + beq 1f +ARM_BE8(rev ip, ip) + tst ip, #0xc00000 @ MOVW has bits 23:22 clear + bic ip, ip, #0x400000 @ clear bit 22 + bfc ip, #0, #12 @ clear imm12 field of MOV[W] instruction + orreq ip, ip, r6 @ MOVW -> mask in offset bits 31-24 + orrne ip, ip, r0 @ MOV -> mask in offset bits 7-0 (or bit 22) +ARM_BE8(rev ip, ip) + b 2f +1: +#endif bic ip, ip, #PV_IMM8_MASK - tst ip, #PV_ROT_MASK @ check the rotation field - orrne ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 - biceq ip, ip, #PV_BIT22 @ clear bit 22 - orreq ip, ip, r0 ARM_BE8(, ror #8) @ mask in offset bits 7-0 (or bit 22) + orr ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 +2: str ip, [r7, r4] add r4, r4, #4 #endif -- cgit From 9443076e4330a14ae2c6114307668b98a8293b77 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Sep 2020 11:55:42 +0300 Subject: ARM: p2v: reduce p2v alignment requirement to 2 MiB The ARM kernel's linear map starts at PAGE_OFFSET, which maps to a physical address (PHYS_OFFSET) that is platform specific, and is discovered at boot. Since we don't want to slow down translations between physical and virtual addresses by keeping the offset in a variable in memory, we implement this by patching the code performing the translation, and putting the offset between PAGE_OFFSET and the start of physical RAM directly into the instruction opcodes. As we only patch up to 8 bits of offset, yielding 4 GiB >> 8 == 16 MiB of granularity, we have to round up PHYS_OFFSET to the next multiple if the start of physical RAM is not a multiple of 16 MiB. This wastes some physical RAM, since the memory that was skipped will now live below PAGE_OFFSET, making it inaccessible to the kernel. We can improve this by changing the patchable sequences and the patching logic to carry more bits of offset: 11 bits gives us 4 GiB >> 11 == 2 MiB of granularity, and so we will never waste more than that amount by rounding up the physical start of DRAM to the next multiple of 2 MiB. (Note that 2 MiB granularity guarantees that the linear mapping can be created efficiently, whereas less than 2 MiB may result in the linear mapping needing another level of page tables) This helps Zhen Lei's scenario, where the start of DRAM is known to be occupied. 
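
For concreteness, the granularity and worst-case waste work out as follows; the DRAM base below is an invented example.

  #include <assert.h>
  #include <stdint.h>

  #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

  int main(void)
  {
          /*  8 patched offset bits -> 4 GiB >> 8  == 16 MiB granularity,
             11 patched offset bits -> 4 GiB >> 11 ==  2 MiB granularity */
          assert(((1ull << 32) >> 8)  == 16 * 1024 * 1024);
          assert(((1ull << 32) >> 11) ==  2 * 1024 * 1024);

          /* a DRAM base that is 2 MiB but not 16 MiB aligned */
          uint64_t dram_base = 0x80200000;

          uint64_t wasted_16m = ALIGN_UP(dram_base, 16 << 20) - dram_base;
          uint64_t wasted_2m  = ALIGN_UP(dram_base,  2 << 20) - dram_base;

          assert(wasted_16m == 14 * 1024 * 1024);  /* lost below PAGE_OFFSET */
          assert(wasted_2m  == 0);                 /* nothing wasted anymore */
          return 0;
  }
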
It also helps EFI boot, which relies on the firmware's page allocator to allocate space for the decompressed kernel as low as possible. And if the KASLR patches ever land for 32-bit, it will give us 3 more bits of randomization of the placement of the kernel inside the linear region. For the ARM code path, it simply comes down to using two add/sub instructions instead of one for the carryless version, and patching each of them with the correct immediate depending on the rotation field. For the LPAE calculation, which has to deal with a carry, it patches the MOVW instruction with up to 12 bits of offset (but we only need 11 bits anyway) For the Thumb2 code path, patching more than 11 bits of displacement would be somewhat cumbersome, but the 11 bits we need fit nicely into the second word of the u16[2] opcode, so we simply update the immediate assignment and the left shift to create an addend of the right magnitude. Suggested-by: Zhen Lei Acked-by: Nicolas Pitre Acked-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/Kconfig | 2 +- arch/arm/include/asm/memory.h | 13 ++++++++----- arch/arm/kernel/phys2virt.S | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fe2f17eb2b50..d8d62b8e72e4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -243,7 +243,7 @@ config ARM_PATCH_PHYS_VIRT kernel in system memory. This can only be used with non-XIP MMU kernels where the base - of physical memory is at a 16MB boundary. + of physical memory is at a 2 MiB boundary. Only disable this option if you know that you do not require this feature (eg, building a kernel for a single machine) and diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index ccf55cef6ab9..2611be35f26b 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -173,6 +173,7 @@ extern unsigned long vectors_base; * so that all we need to do is modify the 8-bit constant field. 
*/ #define __PV_BITS_31_24 0x81000000 +#define __PV_BITS_23_16 0x810000 #define __PV_BITS_7_0 0x81 extern unsigned long __pv_phys_pfn_offset; @@ -187,16 +188,18 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ + "2: " instr " %0, %0, %3\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 1b - ., 2b - .\n" \ " .popsection\n" \ : "=r" (to) \ - : "r" (from), "I" (__PV_BITS_31_24)) + : "r" (from), "I" (__PV_BITS_31_24), \ + "I"(__PV_BITS_23_16)) #define __pv_add_carry_stub(x, y) \ __asm__("@ __pv_add_carry_stub\n" \ "0: movw %R0, #0\n" \ - " adds %Q0, %1, %R0, lsl #24\n" \ + " adds %Q0, %1, %R0, lsl #20\n" \ "1: mov %R0, %2\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ @@ -210,7 +213,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "0: movw %0, #0\n" \ - " lsl %0, #24\n" \ + " lsl %0, #21\n" \ " " instr " %0, %1, %0\n" \ " .pushsection .pv_table,\"a\"\n" \ " .long 0b - .\n" \ @@ -221,7 +224,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_add_carry_stub(x, y) \ __asm__("@ __pv_add_carry_stub\n" \ "0: movw %R0, #0\n" \ - " lsls %R0, #24\n" \ + " lsls %R0, #21\n" \ " adds %Q0, %1, %R0\n" \ "1: mvn %R0, #0\n" \ " adc %R0, %R0, #0\n" \ diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index a4e364689663..fb53db78fe78 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -21,7 +21,7 @@ /* * __fixup_pv_table - patch the stub instructions with the delta between * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be - * 16MiB aligned. + * 2 MiB aligned. * * Called from head.S, which expects the following registers to be preserved: * r1 = machine no, r2 = atags or dtb, @@ -38,8 +38,8 @@ ENTRY(__fixup_pv_table) strcc ip, [r0, #HIGH_OFFSET] @ save to __pv_offset high bits str r3, [r0, #LOW_OFFSET] @ save to __pv_offset low bits - mov r0, r3, lsr #24 @ constant for add/sub instructions - teq r3, r0, lsl #24 @ must be 16MiB aligned + mov r0, r3, lsr #21 @ constant for add/sub instructions + teq r3, r0, lsl #21 @ must be 2 MiB aligned bne 0f adr_l r4, __pv_table_begin @@ -55,22 +55,21 @@ __fixup_a_pv_table: adr_l r6, __pv_offset ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word - mov r6, r6, lsr #24 cmn r0, #1 #ifdef CONFIG_THUMB2_KERNEL @ @ The Thumb-2 versions of the patchable sequences are @ - @ phys-to-virt: movw , #offset<31:24> - @ lsl , #24 + @ phys-to-virt: movw , #offset<31:21> + @ lsl , #21 @ sub , , @ - @ virt-to-phys (non-LPAE): movw , #offset<31:24> - @ lsl , #24 + @ virt-to-phys (non-LPAE): movw , #offset<31:21> + @ lsl , #21 @ add , , @ - @ virt-to-phys (LPAE): movw , #offset<31:24> - @ lsl , #24 + @ virt-to-phys (LPAE): movw , #offset<31:21> + @ lsl , #21 @ adds , , @ mov , #offset<39:32> @ adc , , #0 @@ -102,6 +101,9 @@ __fixup_a_pv_table: @ +-----------+---+---------------------++---+------+----+------+ @ moveq r0, #0x200000 @ set bit 21, mov to mvn instruction + lsrs r3, r6, #29 @ isolate top 3 bits of displacement + ubfx r6, r6, #21, #8 @ put bits 28:21 into the MOVW imm8 field + bfi r6, r3, #12, #3 @ put bits 31:29 into the MOVW imm3 field b .Lnext .Lloop: add r7, r4 adds r4, #4 @ clears Z flag @@ -129,20 +131,24 @@ ARM_BE8(rev16 ip, ip) @ in BE8, we load data in BE, but instructions still in LE #define PV_BIT24 0x00000001 #define PV_IMM8_MASK 0xff000000 +#define 
PV_IMMR_MSB 0x00080000 #else #define PV_BIT24 0x01000000 #define PV_IMM8_MASK 0x000000ff +#define PV_IMMR_MSB 0x00000800 #endif @ @ The ARM versions of the patchable sequences are @ @ phys-to-virt: sub , , #offset<31:24>, lsl #24 + @ sub , , #offset<23:16>, lsl #16 @ @ virt-to-phys (non-LPAE): add , , #offset<31:24>, lsl #24 + @ add , , #offset<23:16>, lsl #16 @ - @ virt-to-phys (LPAE): movw , #offset<31:24> - @ adds , , , lsl #24 + @ virt-to-phys (LPAE): movw , #offset<31:20> + @ adds , , , lsl #20 @ mov , #offset<39:32> @ adc , , #0 @ @@ -174,6 +180,9 @@ ARM_BE8(rev16 ip, ip) @ +------+-----------------+------+------+-------+ @ moveq r0, #0x400000 @ set bit 22, mov to mvn instruction + mov r3, r6, lsr #16 @ put offset bits 31-16 into r3 + mov r6, r6, lsr #24 @ put offset bits 31-24 into r6 + and r3, r3, #0xf0 @ only keep offset bits 23-20 in r3 b .Lnext .Lloop: ldr ip, [r7, r4] #ifdef CONFIG_ARM_LPAE @@ -183,14 +192,17 @@ ARM_BE8(rev ip, ip) tst ip, #0xc00000 @ MOVW has bits 23:22 clear bic ip, ip, #0x400000 @ clear bit 22 bfc ip, #0, #12 @ clear imm12 field of MOV[W] instruction - orreq ip, ip, r6 @ MOVW -> mask in offset bits 31-24 + orreq ip, ip, r6, lsl #4 @ MOVW -> mask in offset bits 31-24 + orreq ip, ip, r3, lsr #4 @ MOVW -> mask in offset bits 23-20 orrne ip, ip, r0 @ MOV -> mask in offset bits 7-0 (or bit 22) ARM_BE8(rev ip, ip) b 2f 1: #endif + tst ip, #PV_IMMR_MSB @ rotation value >= 16 ? bic ip, ip, #PV_IMM8_MASK - orr ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 + orreq ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 + orrne ip, ip, r3 ARM_BE8(, lsl #24) @ mask in offset bits 23-20 2: str ip, [r7, r4] add r4, r4, #4 -- cgit
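
For the non-LPAE ARM path above, the two add/sub instructions are anchored with __PV_BITS_31_24 (0x81000000) and __PV_BITS_23_16 (0x810000); both encode as imm8 = 0x81 with rotations of 8 and 16 bits respectively, so the patcher only needs the top bit of the 4-bit rotation field (the PV_IMMR_MSB test) to decide which slice of the offset to insert. A host-side sketch of that decode, using the little-endian masks:

  #include <assert.h>
  #include <stdint.h>

  static uint32_t ror32(uint32_t x, unsigned int n)
  {
          n &= 31;
          return n ? (x >> n) | (x << (32 - n)) : x;
  }

  /* An ARM "modified immediate" is imm8 rotated right by 2 * rot4. */
  static uint32_t decode_imm12(uint32_t imm12)
  {
          return ror32(imm12 & 0xff, 2 * ((imm12 >> 8) & 0xf));
  }

  int main(void)
  {
          assert(decode_imm12(0x481) == 0x81000000);   /* __PV_BITS_31_24 */
          assert(decode_imm12(0x881) == 0x00810000);   /* __PV_BITS_23_16 */

          /* bit 11 of imm12 (PV_IMMR_MSB) is set only for rotations >= 16,
             i.e. only for the instruction that carries offset bits 23:16 */
          assert((0x481 & 0x800) == 0);
          assert((0x881 & 0x800) != 0);
          return 0;
  }
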