summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig8
-rw-r--r--arch/h8300/Kconfig1
-rw-r--r--arch/h8300/include/asm/hash.h53
-rw-r--r--arch/m68k/Kconfig.cpu1
-rw-r--r--arch/m68k/include/asm/hash.h59
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/microblaze/include/asm/hash.h81
7 files changed, 204 insertions, 0 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index b16e74e4b5af..d794384a0404 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -598,6 +598,14 @@ config HAVE_STACK_VALIDATION
Architecture supports the 'objtool check' host tool command, which
performs compile-time stack metadata validation.
+config HAVE_ARCH_HASH
+ bool
+ default n
+ help
+ If this is set, the architecture provides an <asm/hash.h>
+ file which provides platform-specific implementations of some
+ functions in <linux/hash.h> or fs/namei.c.
+
#
# ABI hall of shame
#
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index aa232de2d4bc..3ae852507e57 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_HASH
select CPU_NO_EFFICIENT_FFS
config RWSEM_GENERIC_SPINLOCK
diff --git a/arch/h8300/include/asm/hash.h b/arch/h8300/include/asm/hash.h
new file mode 100644
index 000000000000..04cfbd2bd850
--- /dev/null
+++ b/arch/h8300/include/asm/hash.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * The later H8SX models have a 32x32-bit multiply, but the H8/300H
+ * and H8S have only 16x16->32. Since it's tolerably compact, this is
+ * basically an inlined version of the __mulsi3 code. Since the inputs
+ * are not expected to be small, it's also simplified by skipping the
+ * early-out checks.
+ *
+ * (Since neither CPU has any multi-bit shift instructions, a
+ * shift-and-add version is a non-starter.)
+ *
+ * TODO: come up with an arch-specific version of the hashing in fs/namei.c,
+ * since that is heavily dependent on rotates. Which, as mentioned, suck
+ * horribly on H8.
+ */
+
+#if defined(CONFIG_CPU_H300H) || defined(CONFIG_CPU_H8S)
+
+#define HAVE_ARCH__HASH_32 1
+
+/*
+ * Multiply by k = 0x61C88647. Fitting this into three registers requires
+ * one extra instruction, but reducing register pressure will probably
+ * make that back and then some.
+ *
+ * GCC asm note: %e1 is the high half of operand %1, while %f1 is the
+ * low half. So if %1 is er4, then %e1 is e4 and %f1 is r4.
+ *
+ * This has been designed to modify x in place, since that's the most
+ * common usage, but preserve k, since hash_64() makes two calls in
+ * quick succession.
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+ u32 temp; /* scratch register pair holding the two cross products */
+
+ asm( "mov.w %e1,%f0" /* temp.low = xhigh */
+ "\n mulxu.w %f2,%0" /* klow * xhigh */
+ "\n mov.w %f0,%e1" /* The extra instruction: park low 16 bits of that product in xhigh */
+ "\n mov.w %f1,%f0" /* temp.low = xlow */
+ "\n mulxu.w %e2,%0" /* khigh * xlow */
+ "\n add.w %e1,%f0" /* temp.low = sum of both cross products (mod 2^16) */
+ "\n mulxu.w %f2,%1" /* klow * xlow */
+ "\n add.w %f0,%e1" /* add the cross terms into the high half of the result */
+ : "=&r" (temp), "=r" (x) /* "&": temp is written before all inputs have been consumed */
+ : "%r" (GOLDEN_RATIO_32), "1" (x)); /* "1": input x lives in the same register as output %1 */
+ return x;
+}
+
+#endif
+#endif /* _ASM_HASH_H */
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 8ace920ca24a..967260f2eb1c 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -41,6 +41,7 @@ config M68000
select CPU_HAS_NO_UNALIGNED
select GENERIC_CSUM
select CPU_NO_EFFICIENT_FFS
+ select HAVE_ARCH_HASH
help
The Freescale (was Motorola) 68000 CPU is the first generation of
the well known M68K family of processors. The CPU core as well as
diff --git a/arch/m68k/include/asm/hash.h b/arch/m68k/include/asm/hash.h
new file mode 100644
index 000000000000..6407af84a994
--- /dev/null
+++ b/arch/m68k/include/asm/hash.h
@@ -0,0 +1,59 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * If CONFIG_M68000=y (original mc68000/010), this file is #included
+ * to work around the lack of a MULU.L instruction.
+ */
+
+#define HAVE_ARCH__HASH_32 1
+/*
+ * While it would be legal to substitute a different hash operation
+ * entirely, let's keep it simple and just use an optimized multiply
+ * by GOLDEN_RATIO_32 = 0x61C88647.
+ *
+ * The best way to do that appears to be to multiply by 0x8647 with
+ * shifts and adds, and use mulu.w to multiply the high half by 0x61C8.
+ *
+ * Because the 68000 has multi-cycle shifts, this addition chain is
+ * chosen to minimise the shift distances.
+ *
+ * Despite every attempt to spoon-feed it simple operations, GCC
+ * 6.1.1 doggedly insists on doing annoying things like converting
+ * "lsl.l #2,<reg>" (12 cycles) to two adds (8+8 cycles).
+ *
+ * It also likes to notice two shifts in a row, like "a = x << 2" and
+ * "a <<= 7", and convert that to "a = x << 9". But shifts longer
+ * than 8 bits are extra-slow on m68k, so that's a lose.
+ *
+ * Since the 68000 is a very simple in-order processor with no
+ * instruction scheduling effects on execution time, we can safely
+ * take it out of GCC's hands and write one big asm() block.
+ *
+ * Without calling overhead, this operation is 30 bytes (14 instructions
+ * plus one immediate constant) and 166 cycles.
+ *
+ * (Because %2 is fetched twice, it can't be postincrement, and thus it
+ * can't be a fully general "g" or "m". Register is preferred, but
+ * offsettable memory or immediate will work.)
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+ u32 a, b; /* partial sums; a + b ends up as x * 0x8647 */
+
+ asm( "move.l %2,%0" /* a = x * 0x0001 */
+ "\n lsl.l #2,%0" /* a = x * 0x0004 */
+ "\n move.l %0,%1" /* b = x * 0x0004 */
+ "\n lsl.l #7,%0" /* a = x * 0x0200 */
+ "\n add.l %2,%0" /* a = x * 0x0201 */
+ "\n add.l %0,%1" /* b = x * 0x0205 */
+ "\n add.l %0,%0" /* a = x * 0x0402 */
+ "\n add.l %0,%1" /* b = x * 0x0607 */
+ "\n lsl.l #5,%0" /* a = x * 0x8040 */
+ : "=&d,d" (a), "=&r,r" (b) /* NOTE(review): 2nd alternative drops "&" yet still accepts x in "r" - confirm a/b can never share x's register */
+ : "r,roi?" (x)); /* a+b = x*0x8647 */
+
+ return ((u16)(x*0x61c8) << 16) + a + b; /* the 0x61C8 part of the constant only affects the top 16 bits */
+}
+
+#endif /* _ASM_HASH_H */
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index f17c3a4fb697..636e0720fb20 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -16,6 +16,7 @@ config MICROBLAZE
select GENERIC_IRQ_SHOW
select GENERIC_PCI_IOMAP
select GENERIC_SCHED_CLOCK
+ select HAVE_ARCH_HASH
select HAVE_ARCH_KGDB
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
diff --git a/arch/microblaze/include/asm/hash.h b/arch/microblaze/include/asm/hash.h
new file mode 100644
index 000000000000..753513ae8cb0
--- /dev/null
+++ b/arch/microblaze/include/asm/hash.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * Fortunately, most people who want to run Linux on Microblaze enable
+ * both multiplier and barrel shifter, but omitting them is technically
+ * a supported configuration.
+ *
+ * With just a barrel shifter, we can implement an efficient constant
+ * multiply using shifts and adds. GCC can find a 9-step solution, but
+ * this 6-step solution was found by Yevgen Voronenko's implementation
+ * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html.
+ *
+ * That software is really not designed for a single multiplier this large,
+ * but if you run it enough times with different seeds, it'll find several
+ * 6-shift, 6-add sequences for computing x * 0x61C88647. They are all
+ * c = (x << 19) + x;
+ * a = (x << 9) + c;
+ * b = (x << 23) + a;
+ * return (a<<11) + (b<<6) + (c<<3) - b;
+ * with variations on the order of the final add.
+ *
+ * Without even a shifter, it's hopeless; any hash function will suck.
+ */
+
+#if CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL == 0
+
+#define HAVE_ARCH__HASH_32 1
+
+/* Multiply by GOLDEN_RATIO_32 = 0x61C88647 using only shifts, adds and subtracts */
+static inline u32 __attribute_const__ __hash_32(u32 a)
+{
+#if CONFIG_XILINX_MICROBLAZE0_USE_BARREL
+ unsigned int b, c;
+
+ /* Phase 1: Compute three intermediate values (x below is the incoming a) */
+ b = a << 23; /* b = x * 0x800000 */
+ c = (a << 19) + a; /* c = x * 0x080001 */
+ a = (a << 9) + c; /* a = x * 0x080201 */
+ b += a; /* b = x * 0x880201 */
+
+ /* Phase 2: Compute (a << 11) + (b << 6) + (c << 3) - b */
+ a <<= 5; /* (a << 5) */
+ a += b; /* (a << 5) + b */
+ a <<= 3; /* (a << 8) + (b << 3) */
+ a += c; /* (a << 8) + (b << 3) + c */
+ a <<= 3; /* (a << 11) + (b << 6) + (c << 3) */
+ return a - b; /* (a << 11) + (b << 6) + (c << 3) - b */
+#else
+ /*
+ * "This is really going to hurt."
+ *
+ * Without a barrel shifter, left shifts are implemented as
+ * repeated additions, and the best we can do is an optimal
+ * addition-subtraction chain. This one is not known to be
+ * optimal, but at 37 steps, it's decent for a 31-bit multiplier.
+ *
+ * Question: given its size (37*4 = 148 bytes per instance),
+ * and slowness, is this worth having inline?
+ */
+ unsigned int b, c, d;
+
+ b = a << 4; /* +4 insns (running total 4): b = x * 0x10 */
+ c = b << 1; /* +1 ( 5): c = x * 0x20 */
+ b += a; /* +1 ( 6): b = x * 0x11 */
+ c += b; /* +1 ( 7): c = x * 0x31 */
+ c <<= 3; /* +3 (10): c = x * 0x188 */
+ c -= a; /* +1 (11): c = x * 0x187 */
+ d = c << 7; /* +7 (18): d = x * 0xC380 */
+ d += b; /* +1 (19): d = x * 0xC391 */
+ d <<= 8; /* +8 (27): d = x * 0xC39100 */
+ d += a; /* +1 (28): d = x * 0xC39101 */
+ d <<= 1; /* +1 (29): d = x * 0x1872202 */
+ d += b; /* +1 (30): d = x * 0x1872213 */
+ d <<= 6; /* +6 (36): d = x * 0x61C884C0 */
+ return d + c; /* +1 (37 total instructions): x * 0x61C88647 */
+#endif
+}
+
+#endif /* !CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL */
+#endif /* _ASM_HASH_H */