From a2c7a98301d9f9bcfc4244de04252a04c0b68a79 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 27 Apr 2018 11:54:40 +0200
Subject: x86/bpf: Clean up non-standard comments, to make the code more
 readable

So by chance I looked into x86 assembly in arch/x86/net/bpf_jit_comp.c and
noticed the weird and inconsistent comment style it mistakenly learned from
the networking code:

 /* Multi-line comment ...
  * ... looks like this.
  */

Fix this to use the standard comment style specified in Documentation/CodingStyle
and used in arch/x86/ as well:

 /*
  * Multi-line comment ...
  * ... looks like this.
  */

Also, to quote Linus's ... more explicit views about this:

  http://article.gmane.org/gmane.linux.kernel.cryptoapi/21066

  > But no, the networking code picked *none* of the above sane formats.
  > Instead, it picked these two models that are just half-arsed
  > shit-for-brains:
  >
  >  (no)
  >      /* This is disgusting drug-induced
  >        * crap, and should die
  >        */
  >
  >   (no-no-no)
  >       /* This is also very nasty
  >        * and visually unbalanced */
  >
  > Please. The networking code actually has the *worst* possible comment
  > style. You can literally find that (no-no-no) style, which is just
  > really horribly disgusting and worse than the otherwise fairly similar
  > (d) in pretty much every way.

Also improve the comments and some other details while at it:

 - Don't mix same-line and previous-line comment style on otherwise
   identical code patterns within the same function,

 - capitalize 'BPF' and x86 register names consistently,

 - capitalize sentences consistently,

 - instead of 'x64' use 'x86-64': x64 is a Microsoft specific term,

 - use more consistent punctuation,

 - use standard coding style in macros as well,

 - fix typos and a few other minor details.

Consistent coding style is not optional, at least in arch/x86/.

No change in functionality.

( In case this commit causes conflicts with pending development code
  I'll be glad to help resolve any conflicts! )

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/net/bpf_jit_comp.c | 233 +++++++++++++++++++++++++-------------------
 1 file changed, 133 insertions(+), 100 deletions(-)

(limited to 'arch/x86/net')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index abce27ceb411..1c3c81ddeb87 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
-/* bpf_jit_comp.c : BPF JIT compiler
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
  *
  * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
  * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -17,7 +18,7 @@
 #include <asm/nospec-branch.h>
 
 /*
- * assembly code in arch/x86/net/bpf_jit.S
+ * Assembly code in arch/x86/net/bpf_jit.S
  */
 extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
 extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
@@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
 #define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
 #define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
 #define EMIT1_off32(b1, off) \
-	do {EMIT1(b1); EMIT(off, 4); } while (0)
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
 #define EMIT2_off32(b1, b2, off) \
-	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
 #define EMIT3_off32(b1, b2, b3, off) \
-	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
 #define EMIT4_off32(b1, b2, b3, b4, off) \
-	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 static bool is_imm8(int value)
 {
@@ -70,9 +72,10 @@ static bool is_uimm32(u64 value)
 }
 
 /* mov dst, src */
-#define EMIT_mov(DST, SRC) \
-	do {if (DST != SRC) \
-		EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+#define EMIT_mov(DST, SRC)								 \
+	do {										 \
+		if (DST != SRC)								 \
+			EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
 	} while (0)
 
 static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 		return 0;
 }
 
-/* list of x86 cond jumps opcodes (. + s8)
+/*
+ * List of x86 cond jumps opcodes (. + s8)
  * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
  */
 #define X86_JB  0x72
@@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define CHOOSE_LOAD_FUNC(K, func) \
 	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
 
-/* pick a register outside of BPF range for JIT internal work */
+/* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
-/* The following table maps BPF registers to x64 registers.
+/*
+ * The following table maps BPF registers to x86-64 registers.
  *
- * x64 register r12 is unused, since if used as base address
+ * x86-64 register R12 is unused, since if used as base address
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- *  r9 caches skb->len - skb->data_len
- * r10 caches skb->data, and used for blinding (if enabled)
+ * R9  caches skb->len - skb->data_len
+ * R10 caches skb->data, and used for blinding (if enabled)
  */
 static const int reg2hex[] = {
-	[BPF_REG_0] = 0,  /* rax */
-	[BPF_REG_1] = 7,  /* rdi */
-	[BPF_REG_2] = 6,  /* rsi */
-	[BPF_REG_3] = 2,  /* rdx */
-	[BPF_REG_4] = 1,  /* rcx */
-	[BPF_REG_5] = 0,  /* r8 */
-	[BPF_REG_6] = 3,  /* rbx callee saved */
-	[BPF_REG_7] = 5,  /* r13 callee saved */
-	[BPF_REG_8] = 6,  /* r14 callee saved */
-	[BPF_REG_9] = 7,  /* r15 callee saved */
-	[BPF_REG_FP] = 5, /* rbp readonly */
-	[BPF_REG_AX] = 2, /* r10 temp register */
-	[AUX_REG] = 3,    /* r11 temp register */
+	[BPF_REG_0] = 0,  /* RAX */
+	[BPF_REG_1] = 7,  /* RDI */
+	[BPF_REG_2] = 6,  /* RSI */
+	[BPF_REG_3] = 2,  /* RDX */
+	[BPF_REG_4] = 1,  /* RCX */
+	[BPF_REG_5] = 0,  /* R8  */
+	[BPF_REG_6] = 3,  /* RBX callee saved */
+	[BPF_REG_7] = 5,  /* R13 callee saved */
+	[BPF_REG_8] = 6,  /* R14 callee saved */
+	[BPF_REG_9] = 7,  /* R15 callee saved */
+	[BPF_REG_FP] = 5, /* RBP readonly */
+	[BPF_REG_AX] = 2, /* R10 temp register */
+	[AUX_REG] = 3,    /* R11 temp register */
 };
 
-/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  * which need extra byte of encoding.
  * rax,rcx,...,rbp have simpler encoding
  */
@@ -153,7 +159,7 @@ static bool is_axreg(u32 reg)
 	return reg == BPF_REG_0;
 }
 
-/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
 	if (is_ereg(reg))
@@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
 	return byte;
 }
 
-/* encode 'dst_reg' register into x64 opcode 'byte' */
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
 	return byte + reg2hex[dst_reg];
 }
 
-/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
 static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 {
 	return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
@@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 
 static void jit_fill_hole(void *area, unsigned int size)
 {
-	/* fill whole space with int3 instructions */
+	/* Fill whole space with INT3 instructions */
 	memset(area, 0xcc, size);
 }
 
 struct jit_context {
-	int cleanup_addr; /* epilogue code offset */
+	int cleanup_addr; /* Epilogue code offset */
 	bool seen_ld_abs;
 	bool seen_ax_reg;
 };
 
-/* maximum number of bytes emitted while JITing one eBPF insn */
+/* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
 #define AUX_STACK_SPACE \
-	(32 /* space for rbx, r13, r14, r15 */ + \
-	 8 /* space for skb_copy_bits() buffer */)
+	(32 /* Space for RBX, R13, R14, R15 */ + \
+	  8 /* Space for skb_copy_bits() buffer */)
 
 #define PROLOGUE_SIZE 37
 
-/* emit x64 prologue code for BPF program and check it's size.
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	EMIT1(0x55); /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+	/* push rbp */
+	EMIT1(0x55);
+
+	/* mov rbp,rsp */
+	EMIT3(0x48, 0x89, 0xE5);
 
 	/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
 	EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* all classic BPF filters use R6(rbx) save it */
+	/* All classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
 
-	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
-	 * R8(r14). R9(r15) spill could be made conditional, but there is only
+	/*
+	 * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
+	 * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
+	 * R8(R14). R9(R15) spill could be made conditional, but there is only
 	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
 	 * The overhead of extra spill is negligible for any filter other
 	 * than synthetic ones. Therefore not worth adding complexity.
@@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	EMIT4(0x4C, 0x89, 0x7D, 24);
 
 	if (!ebpf_from_cbpf) {
-		/* Clear the tail call counter (tail_call_cnt): for eBPF tail
+		/*
+		 * Clear the tail call counter (tail_call_cnt): for eBPF tail
 		 * calls we need to reset the counter to 0. It's done in two
-		 * instructions, resetting rax register to 0, and moving it
+		 * instructions, resetting RAX register to 0, and moving it
 		 * to the counter location.
 		 */
 
@@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	*pprog = prog;
 }
 
-/* generate the following code:
+/*
+ * Generate the following code:
+ *
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
@@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	int label1, label2, label3;
 	int cnt = 0;
 
-	/* rdi - pointer to ctx
+	/*
+	 * rdi - pointer to ctx
 	 * rsi - pointer to bpf_array
 	 * rdx - index in bpf_array
 	 */
 
-	/* if (index >= array->map.max_entries)
-	 *   goto out;
+	/*
+	 * if (index >= array->map.max_entries)
+	 *	goto out;
 	 */
 	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
 	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
-	 *   goto out;
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, 36);              /* mov eax, dword ptr [rbp + 36] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT4_off32(0x48, 0x8B, 0x84, 0xD6,       /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
 		    offsetof(struct bpf_array, ptrs));
 
-	/* if (prog == NULL)
-	 *   goto out;
+	/*
+	 * if (prog == NULL)
+	 *	goto out;
 	 */
 	EMIT3(0x48, 0x85, 0xC0);		  /* test rax,rax */
 #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog)
 	      offsetof(struct bpf_prog, bpf_func));
 	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
 
-	/* now we're ready to jump into next BPF program
+	/*
+	 * Wow we're ready to jump into next BPF program
 	 * rdi == ctx (1st arg)
 	 * rax == prog->bpf_func + prologue_size
 	 */
@@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	/* r9d = skb->len - skb->data_len (headlen)
+	/*
+	 * r9d = skb->len - skb->data_len (headlen)
 	 * r10 = skb->data
 	 */
 	/* mov %r9d, off32(%rdi) */
@@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 	u8 b1, b2, b3;
 	int cnt = 0;
 
-	/* optimization: if imm32 is positive, use 'mov %eax, imm32'
+	/*
+	 * Optimization: if imm32 is positive, use 'mov %eax, imm32'
 	 * (which zero-extends imm32) to save 2 bytes.
 	 */
 	if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 		goto done;
 	}
 
-	/* optimization: if imm32 is zero, use 'xor %eax, %eax'
+	/*
+	 * Optimization: if imm32 is zero, use 'xor %eax, %eax'
 	 * to save 3 bytes.
 	 */
 	if (imm32 == 0) {
@@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 	int cnt = 0;
 
 	if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
-		/* For emitting plain u32, where sign bit must not be
+		/*
+		 * For emitting plain u32, where sign bit must not be
 		 * propagated LLVM tends to load imm64 over mov32
 		 * directly, so save couple of bytes by just doing
 		 * 'mov %eax, imm32' instead.
@@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));
 
-			/* b3 holds 'normal' opcode, b2 short form only valid
+			/*
+			 * b3 holds 'normal' opcode, b2 short form only valid
 			 * in case dst is eax/rax.
 			 */
 			switch (BPF_OP(insn->code)) {
@@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			/* mov rax, dst_reg */
 			EMIT_mov(BPF_REG_0, dst_reg);
 
-			/* xor edx, edx
+			/*
+			 * xor edx, edx
 			 * equivalent to 'xor rdx, rdx', but one byte less
 			 */
 			EMIT2(0x31, 0xd2);
@@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			break;
 		}
-			/* shifts */
+			/* Shifts */
 		case BPF_ALU | BPF_LSH | BPF_K:
 		case BPF_ALU | BPF_RSH | BPF_K:
 		case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* check for bad case when dst_reg == rcx */
+			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
 				EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
 			switch (imm32) {
 			case 16:
-				/* emit 'ror %ax, 8' to swap lower 2 bytes */
+				/* Emit 'ror %ax, 8' to swap lower 2 bytes */
 				EMIT1(0x66);
 				if (is_ereg(dst_reg))
 					EMIT1(0x41);
 				EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
 
-				/* emit 'movzwl eax, ax' */
+				/* Emit 'movzwl eax, ax' */
 				if (is_ereg(dst_reg))
 					EMIT3(0x45, 0x0F, 0xB7);
 				else
@@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'bswap eax' to swap lower 4 bytes */
+				/* Emit 'bswap eax' to swap lower 4 bytes */
 				if (is_ereg(dst_reg))
 					EMIT2(0x41, 0x0F);
 				else
@@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_1reg(0xC8, dst_reg));
 				break;
 			case 64:
-				/* emit 'bswap rax' to swap 8 bytes */
+				/* Emit 'bswap rax' to swap 8 bytes */
 				EMIT3(add_1mod(0x48, dst_reg), 0x0F,
 				      add_1reg(0xC8, dst_reg));
 				break;
@@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_LE:
 			switch (imm32) {
 			case 16:
-				/* emit 'movzwl eax, ax' to zero extend 16-bit
+				/*
+				 * Emit 'movzwl eax, ax' to zero extend 16-bit
 				 * into 64 bit
 				 */
 				if (is_ereg(dst_reg))
@@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'mov eax, eax' to clear upper 32-bits */
+				/* Emit 'mov eax, eax' to clear upper 32-bits */
 				if (is_ereg(dst_reg))
 					EMIT1(0x45);
 				EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +835,9 @@ st:			if (is_imm8(insn->off))
 
 			/* STX: *(u8*)(dst_reg + off) = src_reg */
 		case BPF_STX | BPF_MEM | BPF_B:
-			/* emit 'mov byte ptr [rax + off], al' */
+			/* Emit 'mov byte ptr [rax + off], al' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg) ||
-			    /* have to add extra byte for x86 SIL, DIL regs */
+			    /* We have to add extra byte for x86 SIL, DIL regs */
 			    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
 				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
 			else
@@ -840,25 +866,26 @@ stx:			if (is_imm8(insn->off))
 
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
-			/* emit 'movzx rax, byte ptr [rax + off]' */
+			/* Emit 'movzx rax, byte ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_H:
-			/* emit 'movzx rax, word ptr [rax + off]' */
+			/* Emit 'movzx rax, word ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_W:
-			/* emit 'mov eax, dword ptr [rax+0x14]' */
+			/* Emit 'mov eax, dword ptr [rax+0x14]' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
 			else
 				EMIT1(0x8B);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_DW:
-			/* emit 'mov rax, qword ptr [rax+0x14]' */
+			/* Emit 'mov rax, qword ptr [rax+0x14]' */
 			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx:			/* if insn->off == 0 we can save one extra byte, but
-			 * special case of x86 r13 which always needs an offset
+ldx:			/*
+			 * If insn->off == 0 we can save one extra byte, but
+			 * special case of x86 R13 which always needs an offset
 			 * is not worth the hassle
 			 */
 			if (is_imm8(insn->off))
@@ -870,7 +897,7 @@ ldx:			/* if insn->off == 0 we can save one extra byte, but
 
 			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
 		case BPF_STX | BPF_XADD | BPF_W:
-			/* emit 'lock add dword ptr [rax + off], eax' */
+			/* Emit 'lock add dword ptr [rax + off], eax' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
 			else
@@ -897,14 +924,15 @@ xadd:			if (is_imm8(insn->off))
 				} else {
 					EMIT2(0x41, 0x52); /* push %r10 */
 					EMIT2(0x41, 0x51); /* push %r9 */
-					/* need to adjust jmp offset, since
+					/*
+					 * We need to adjust jmp offset, since
 					 * pop %r9, pop %r10 take 4 bytes after call insn
 					 */
 					jmp_offset += 4;
 				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -970,7 +998,7 @@ xadd:			if (is_imm8(insn->off))
 			else
 				EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
 
-emit_cond_jmp:		/* convert BPF opcode to x86 */
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
 			switch (BPF_OP(insn->code)) {
 			case BPF_JEQ:
 				jmp_cond = X86_JE;
@@ -996,22 +1024,22 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_cond = X86_JBE;
 				break;
 			case BPF_JSGT:
-				/* signed '>', GT in x86 */
+				/* Signed '>', GT in x86 */
 				jmp_cond = X86_JG;
 				break;
 			case BPF_JSLT:
-				/* signed '<', LT in x86 */
+				/* Signed '<', LT in x86 */
 				jmp_cond = X86_JL;
 				break;
 			case BPF_JSGE:
-				/* signed '>=', GE in x86 */
+				/* Signed '>=', GE in x86 */
 				jmp_cond = X86_JGE;
 				break;
 			case BPF_JSLE:
-				/* signed '<=', LE in x86 */
+				/* Signed '<=', LE in x86 */
 				jmp_cond = X86_JLE;
 				break;
-			default: /* to silence gcc warning */
+			default: /* to silence GCC warning */
 				return -EFAULT;
 			}
 			jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1039,7 +1067,7 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_offset = addrs[i + insn->off] - addrs[i];
 
 			if (!jmp_offset)
-				/* optimize out nop jumps */
+				/* Optimize out nop jumps */
 				break;
 emit_jmp:
 			if (is_imm8(jmp_offset)) {
@@ -1061,7 +1089,7 @@ common_load:
 			ctx->seen_ld_abs = seen_ld_abs = true;
 			jmp_offset = func - (image + addrs[i]);
 			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -1080,7 +1108,8 @@ common_load:
 						EMIT2_off32(0x81, 0xC6, imm32);
 				}
 			}
-			/* skb pointer is in R6 (%rbx), it will be copied into
+			/*
+			 * skb pointer is in R6 (%rbx), it will be copied into
 			 * %rdi if skb_copy_bits() call is necessary.
 			 * sk_load_* helpers also use %r10 and %r9d.
 			 * See bpf_jit.S
@@ -1111,7 +1140,7 @@ common_load:
 				goto emit_jmp;
 			}
 			seen_exit = true;
-			/* update cleanup_addr */
+			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp+0] */
 			EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1129,10 +1158,11 @@ common_load:
 			break;
 
 		default:
-			/* By design x64 JIT should support all BPF instructions
+			/*
+			 * By design x86-64 JIT should support all BPF instructions.
 			 * This error will be seen if new instruction was added
-			 * to interpreter, but not to JIT
-			 * or if there is junk in bpf_prog
+			 * to the interpreter, but not to the JIT, or if there is
+			 * junk in bpf_prog.
 			 */
 			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
 			return -EINVAL;
@@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
-	/* If blinding was requested and we failed during blinding,
+	/*
+	 * If blinding was requested and we failed during blinding,
 	 * we must fall back to the interpreter.
 	 */
 	if (IS_ERR(tmp))
@@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_addrs;
 	}
 
-	/* Before first pass, make a rough estimation of addrs[]
-	 * each bpf instruction is translated to less than 64 bytes
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
 	 */
 	for (proglen = 0, i = 0; i < prog->len; i++) {
 		proglen += 64;
@@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.cleanup_addr = proglen;
 skip_init_addrs:
 
-	/* JITed image shrinks with every pass and the loop iterates
-	 * until the image stops shrinking. Very large bpf programs
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
 	 * may converge on the last pass. In such case do one more
-	 * pass to emit the final image
+	 * pass to emit the final image.
 	 */
 	for (pass = 0; pass < 20 || image; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
-- 
cgit 


From 03f5781be2c7b7e728d724ac70ba10799cc710d7 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Thu, 3 May 2018 14:10:43 +0800
Subject: bpf, x86_32: add eBPF JIT compiler for ia32

The JIT compiler emits ia32 bit instructions. Currently, It supports eBPF
only. Classic BPF is supported because of the conversion by BPF core.

Almost all instructions from eBPF ISA supported except the following:
BPF_ALU64 | BPF_DIV | BPF_K
BPF_ALU64 | BPF_DIV | BPF_X
BPF_ALU64 | BPF_MOD | BPF_K
BPF_ALU64 | BPF_MOD | BPF_X
BPF_STX | BPF_XADD | BPF_W
BPF_STX | BPF_XADD | BPF_DW

It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL at the moment.

IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI. I use
EAX|EDX|ECX|EBX as temporary registers to simulate instructions in eBPF
ISA, and allocate ESI|EDI to BPF_REG_AX for constant blinding, all others
eBPF registers, R0-R10, are simulated through scratch space on stack.

The reasons behind the hardware registers allocation policy are:
1:MUL need EAX:EDX, shift operation need ECX, so they aren't fit
  for general eBPF 64bit register simulation.
2:We need at least 4 registers to simulate most eBPF ISA operations
  on registers operands instead of on register&memory operands.
3:We need to put BPF_REG_AX on hardware registers, or constant blinding
  will degrade jit performance heavily.

Tested on PC (Intel(R) Core(TM) i5-5200U CPU).
Testing results on i5-5200U:
1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
2) test_progs: Summary: 83 PASSED, 0 FAILED.
3) test_lpm: OK
4) test_lru_map: OK
5) test_verifier: Summary: 828 PASSED, 0 FAILED.

Above tests are all done in following two conditions separately:
1:bpf_jit_enable=1 and bpf_jit_harden=0
2:bpf_jit_enable=1 and bpf_jit_harden=2

Below are some numbers for this jit implementation:
Note:
  I run test_progs in kselftest 100 times continuously for every condition,
  the numbers are in format: total/times=avg.
  The numbers that test_bpf reports show almost the same relation.

a:jit_enable=0 and jit_harden=0            b:jit_enable=1 and jit_harden=0
  test_pkt_access:PASS:ipv4:15622/100=156    test_pkt_access:PASS:ipv4:10674/100=106
  test_pkt_access:PASS:ipv6:9130/100=91      test_pkt_access:PASS:ipv6:4855/100=48
  test_xdp:PASS:ipv4:240198/100=2401         test_xdp:PASS:ipv4:138912/100=1389
  test_xdp:PASS:ipv6:137326/100=1373         test_xdp:PASS:ipv6:68542/100=685
  test_l4lb:PASS:ipv4:61100/100=611          test_l4lb:PASS:ipv4:37302/100=373
  test_l4lb:PASS:ipv6:101000/100=1010        test_l4lb:PASS:ipv6:55030/100=550

c:jit_enable=1 and jit_harden=2
  test_pkt_access:PASS:ipv4:10558/100=105
  test_pkt_access:PASS:ipv6:5092/100=50
  test_xdp:PASS:ipv4:131902/100=1319
  test_xdp:PASS:ipv6:77932/100=779
  test_l4lb:PASS:ipv4:38924/100=389
  test_l4lb:PASS:ipv6:57520/100=575

The numbers show we get 30%~50% improvement.

See Documentation/networking/filter.txt for more information.

Changelog:

 Changes v5-v6:
 1:Add do {} while (0) to RETPOLINE_RAX_BPF_JIT for
   consistence reason.
 2:Clean up non-standard comments, reported by Daniel Borkmann.
 3:Fix a memory leak issue, repoted by Daniel Borkmann.

 Changes v4-v5:
 1:Delete is_on_stack, BPF_REG_AX is the only one
   on real hardware registers, so just check with
   it.
 2:Apply commit 1612a981b766 ("bpf, x64: fix JIT emission
   for dead code"), suggested by Daniel Borkmann.

 Changes v3-v4:
 1:Fix changelog in commit.
   I install llvm-6.0, then test_progs willn't report errors.
   I submit another patch:
   "bpf: fix misaligned access for BPF_PROG_TYPE_PERF_EVENT program type on x86_32 platform"
   to fix another problem, after that patch, test_verifier willn't report errors too.
 2:Fix clear r0[1] twice unnecessarily in *BPF_IND|BPF_ABS* simulation.

 Changes v2-v3:
 1:Move BPF_REG_AX to real hardware registers for performance reason.
 3:Using bpf_load_pointer instead of bpf_jit32.S, suggested by Daniel Borkmann.
 4:Delete partial codes in 1c2a088a6626, suggested by Daniel Borkmann.
 5:Some bug fixes and comments improvement.

 Changes v1-v2:
 1:Fix bug in emit_ia32_neg64.
 2:Fix bug in emit_ia32_arsh_r64.
 3:Delete filename in top level comment, suggested by Thomas Gleixner.
 4:Delete unnecessary boiler plate text, suggested by Thomas Gleixner.
 5:Rewrite some words in changelog.
 6:CodingSytle improvement and a little more comments.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/net/Makefile         |    8 +-
 arch/x86/net/bpf_jit_comp32.c | 2553 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2559 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/net/bpf_jit_comp32.c

(limited to 'arch/x86/net')

diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b619598..c6b464a261bb 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,10 @@
 #
 # Arch-specific network modules
 #
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
 
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+ifeq ($(CONFIG_X86_32),y)
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+        OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+        obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000000..61e61341b777
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2553 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ *                         high
+ * original ESP =>        +-----+
+ *                        |     | callee saved registers
+ *                        +-----+
+ *                        | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP  =>    +-----+
+ *                        | ... | eBPF prog stack
+ *                        +-----+
+ *                        |RSVD | JIT scratchpad
+ * current ESP =>         +-----+
+ *                        |     |
+ *                        | ... | Function call stack
+ *                        |     |
+ *                        +-----+
+ *                          low
+ *
+ * The callee saved registers:
+ *
+ *                                high
+ * original ESP =>        +------------------+ \
+ *                        |        ebp       | |
+ * current EBP =>         +------------------+ } callee saved registers
+ *                        |    ebx,esi,edi   | |
+ *                        +------------------+ /
+ *                                low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	if (len == 1)
+		*ptr = bytes;
+	else if (len == 2)
+		*(u16 *)ptr = bytes;
+	else {
+		*(u32 *)ptr = bytes;
+		barrier();
+	}
+	return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)   \
+	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+	return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+	return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k)	(k)
+#define TCALL_CNT	(MAX_BPF_JIT_REG + 0)	/* Tail Call Count */
+
+#define IA32_EAX	(0x0)
+#define IA32_EBX	(0x3)
+#define IA32_ECX	(0x1)
+#define IA32_EDX	(0x2)
+#define IA32_ESI	(0x6)
+#define IA32_EDI	(0x7)
+#define IA32_EBP	(0x5)
+#define IA32_ESP	(0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB  0x72
+#define IA32_JAE 0x73
+#define IA32_JE  0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA  0x77
+#define IA32_JL  0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG  0x7F
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ *    registers.
+ * 3. For performance reason, the BPF_REG_AX for blinding constant, is
+ *    mapped to real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF registers with two IA32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+	/* Return value from in-kernel function, and exit value from eBPF */
+	[BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+	/* The arguments from eBPF program to in-kernel function */
+	/* Stored on stack scratch space */
+	[BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+	[BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+	[BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+	[BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+	[BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+	/* Callee saved registers that in-kernel function will preserve */
+	/* Stored on stack scratch space */
+	[BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+	[BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+	[BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+	[BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+	/* Read only Frame Pointer to access Stack */
+	[BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+	/* Temporary register for blinding constants. */
+	[BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+	/* Tail call count. Stored on stack scratch space. */
+	[TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo	dst[0]
+#define dst_hi	dst[1]
+#define src_lo	src[0]
+#define src_hi	src[1]
+
+#define STACK_ALIGNMENT	8
+/*
+ * Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE \
+	(stack_depth + \
+	 + SCRATCH_SIZE + \
+	 + 4 /* Extra space for skb_copy_bits buffer */)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Offset of skb_copy_bits buffer */
+#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+
+/* Encode 'dst_reg' register into IA32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	return byte + dst_reg;
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	/* Fill whole space with int3 instructions */
+	memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk) {
+		if (val == 0) {
+			/* xor eax,eax */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		} else {
+			EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+				    STACK_VAR(dst), val);
+		}
+	} else {
+		if (val == 0)
+			EMIT2(0x33, add_2reg(0xC0, dst, dst));
+		else
+			EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+				    val);
+	}
+	*pprog = prog;
+}
+
+/* dst = imm (4 bytes)*/
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
+	else
+		/* mov dst,sreg */
+		EMIT2(0x89, add_2reg(0xC0, dst, sreg));
+
+	*pprog = prog;
+}
+
+/* dst = src */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+				     const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+	if (is64)
+		/* complete 8 byte move */
+		emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+	else
+		/* zero out high 4 bytes */
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+}
+
+/* Sign extended move */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+				     const u32 val, bool dstk, u8 **pprog)
+{
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+	emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+	emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst * src
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_ECX : src;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+
+	EMIT2(0xF7, add_1reg(0xE0, sreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov dst,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+					 bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk && val != 64) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/*
+		 * Emit 'movzwl eax,ax' to zero extend 16-bit
+		 * into 64 bit
+		 */
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* nop */
+		break;
+	}
+
+	if (dstk && val != 64) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+				       bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/* Emit 'ror %ax, 8' to swap lower 2 bytes */
+		EMIT1(0x66);
+		EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* Emit 'bswap edx' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_hi));
+
+		/* mov ecx,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* mov dreg_lo,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+		break;
+	}
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+				       bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+	/* xor edx,edx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* div ecx */
+	EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+	if (op == BPF_MOD) {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+	} else {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (shift) src
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 b2;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	switch (op) {
+	case BPF_LSH:
+		b2 = 0xE0; break;
+	case BPF_RSH:
+		b2 = 0xE8; break;
+	case BPF_ARSH:
+		b2 = 0xF8; break;
+	default:
+		return;
+	}
+	EMIT2(0xD3, add_1reg(b2, dreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) src
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+	u8 dreg = dstk ? IA32_EDX : dst;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+	switch (BPF_OP(op)) {
+	/* dst = dst + src */
+	case BPF_ADD:
+		if (hi && is64)
+			EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst - src */
+	case BPF_SUB:
+		if (hi && is64)
+			EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst | src */
+	case BPF_OR:
+		EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & src */
+	case BPF_AND:
+		EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ src */
+	case BPF_XOR:
+		EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+				     const u8 dst[], const u8 src[],
+				     bool dstk,  bool sstk,
+				     u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
+	if (is64)
+		emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
+				&prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) val
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const s32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 sreg = IA32_EDX;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (!is_imm8(val))
+		/* mov edx,imm32*/
+		EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+	switch (op) {
+	/* dst = dst + val */
+	case BPF_ADD:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD0, dreg), val);
+			else
+				EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xC0, dreg), val);
+			else
+				EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst - val */
+	case BPF_SUB:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD8, dreg), val);
+			else
+				EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xE8, dreg), val);
+			else
+				EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst | val */
+	case BPF_OR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xC8, dreg), val);
+		else
+			EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & val */
+	case BPF_AND:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xE0, dreg), val);
+		else
+			EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ val */
+	case BPF_XOR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xF0, dreg), val);
+		else
+			EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	case BPF_NEG:
+		EMIT2(0xF7, add_1reg(0xD8, dreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+				     const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+
+	emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
+	if (is64)
+		emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+
+	*pprog = prog;
+}
+
+/* dst = ~dst (64 bit) */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sub dreg_lo,ecx */
+	EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX));
+	/* mov dreg_lo,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sbb dreg_hi,ecx */
+	EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX));
+	/* mov dreg_hi,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst << src */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* shl dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
+	/* mov ebx,dreg_lo */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shr ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+	/* or dreg_hi,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+	/* mov dreg_hi,dreg_lo */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src (signed)*/
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+				      bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst << val */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do LSH operation */
+	if (val < 32) {
+		/* shl dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
+		/* mov ebx,dreg_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shr ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+		/* or dreg_hi,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val (signed) */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+				      bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	} else {
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_hi));
+	else
+		/* mov eax,dst_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	else
+		/* mul src_hi */
+		EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+	/* add eax,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u32 hi;
+
+	hi = val & (1<<31) ? (u32)~0 : 0;
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	else
+		/* mul dst_hi */
+		EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 4; /* imm32 */
+	else
+		return 0;
+}
+
+struct jit_context {
+	int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE	128
+#define BPF_INSN_SAFETY		64
+
+#define PROLOGUE_SIZE 35
+
+/*
+ * Emit prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+	const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+	/* push ebp */
+	EMIT1(0x55);
+	/* mov ebp,esp */
+	EMIT2(0x89, 0xE5);
+	/* push edi */
+	EMIT1(0x57);
+	/* push esi */
+	EMIT1(0x56);
+	/* push ebx */
+	EMIT1(0x53);
+
+	/* sub esp,STACK_SIZE */
+	EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+	/* sub ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+	/* xor ebx,ebx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+	/* Set up BPF prog stack base register */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+	/* Move BPF_CTX (EAX) to BPF_REG_R1 */
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+	/* Initialize Tail Count */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* Emit epilogue code for BPF program */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	const u8 *r0 = bpf2ia32[BPF_REG_0];
+	int cnt = 0;
+
+	/* mov eax,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+	/* mov edx,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+	/* add ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+	/* mov ebx,dword ptr [ebp-12]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+	/* mov esi,dword ptr [ebp-8]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+	/* mov edi,dword ptr [ebp-4]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+	EMIT1(0xC9); /* leave */
+	EMIT1(0xC3); /* ret */
+	*pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->ptrs[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 *r2 = bpf2ia32[BPF_REG_2];
+	const u8 *r3 = bpf2ia32[BPF_REG_3];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+	u32 lo, hi;
+	static int jmp_label1 = -1;
+
+	/*
+	 * if (index >= array->map.max_entries)
+	 *     goto out;
+	 */
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+	/* mov edx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+	/* cmp dword ptr [eax+off],edx */
+	EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+	      offsetof(struct bpf_array, map.max_entries));
+	/* jbe out */
+	EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *     goto out;
+	 */
+	lo = (u32)MAX_TAIL_CALL_CNT;
+	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* cmp edx,hi */
+	EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
+	EMIT2(IA32_JNE, 3);
+	/* cmp ecx,lo */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
+
+	/* ja out */
+	EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+
+	/* add eax,0x1 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
+	/* adc ebx,0x0 */
+	EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
+
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	/* mov dword ptr [ebp+off],edx */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* prog = array->ptrs[index]; */
+	/* mov edx, [eax + edx * 4 + offsetof(...)] */
+	EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+	/*
+	 * if (prog == NULL)
+	 *     goto out;
+	 */
+	/* test edx,edx */
+	EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* je out */
+	EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	/* mov edx, dword ptr [edx + 32] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+	      offsetof(struct bpf_prog, bpf_func));
+	/* add edx,prologue_size */
+	EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+	/*
+	 * Now we're ready to jump into next BPF program:
+	 * eax == ctx (1st arg)
+	 * edx == prog->bpf_func + prologue_size
+	 */
+	RETPOLINE_EDX_BPF_JIT();
+
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* out: */
+	*pprog = prog;
+}
+
+/* Push the scratch stack register on top of the stack. */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+	/* push ecx */
+	EMIT1(0x51);
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+	/* push ecx */
+	EMIT1(0x51);
+
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		const s32 imm32 = insn->imm;
+		const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+		const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+		const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+		const u8 code = insn->code;
+		const u8 *dst = bpf2ia32[insn->dst_reg];
+		const u8 *src = bpf2ia32[insn->src_reg];
+		const u8 *r0 = bpf2ia32[BPF_REG_0];
+		s64 jmp_offset;
+		u8 jmp_cond;
+		int ilen;
+		u8 *func;
+
+		switch (code) {
+		/* ALU operations */
+		/* dst = src */
+		case BPF_ALU | BPF_MOV | BPF_K:
+		case BPF_ALU | BPF_MOV | BPF_X:
+		case BPF_ALU64 | BPF_MOV | BPF_K:
+		case BPF_ALU64 | BPF_MOV | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mov_r64(is64, dst, src, dstk,
+						  sstk, &prog);
+				break;
+			case BPF_K:
+				/* Sign-extend immediate value to dst reg */
+				emit_ia32_mov_i64(is64, dst, imm32,
+						  dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = dst + src/imm */
+		/* dst = dst - src/imm */
+		/* dst = dst | src/imm */
+		/* dst = dst & src/imm */
+		/* dst = dst ^ src/imm */
+		/* dst = dst * src/imm */
+		/* dst = dst << src */
+		/* dst = dst >> src */
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_K:
+		case BPF_ALU | BPF_XOR | BPF_X:
+		case BPF_ALU64 | BPF_ADD | BPF_K:
+		case BPF_ALU64 | BPF_ADD | BPF_X:
+		case BPF_ALU64 | BPF_SUB | BPF_K:
+		case BPF_ALU64 | BPF_SUB | BPF_X:
+		case BPF_ALU64 | BPF_OR | BPF_K:
+		case BPF_ALU64 | BPF_OR | BPF_X:
+		case BPF_ALU64 | BPF_AND | BPF_K:
+		case BPF_ALU64 | BPF_AND | BPF_X:
+		case BPF_ALU64 | BPF_XOR | BPF_K:
+		case BPF_ALU64 | BPF_XOR | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+						  src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+						  imm32, dstk, &prog);
+				break;
+			}
+			break;
+		case BPF_ALU | BPF_MUL | BPF_K:
+		case BPF_ALU | BPF_MUL | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r(dst_lo, src_lo, dstk,
+						sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+						false, &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU | BPF_LSH | BPF_X:
+		case BPF_ALU | BPF_RSH | BPF_X:
+		case BPF_ALU | BPF_ARSH | BPF_K:
+		case BPF_ALU | BPF_ARSH | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+						  dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_shift_r(BPF_OP(code), dst_lo,
+						  IA32_ECX, dstk, false,
+						  &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst / src(imm) */
+		/* dst = dst % src(imm) */
+		case BPF_ALU | BPF_DIV | BPF_K:
+		case BPF_ALU | BPF_DIV | BPF_X:
+		case BPF_ALU | BPF_MOD | BPF_K:
+		case BPF_ALU | BPF_MOD | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    src_lo, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    IA32_ECX, dstk, false,
+						    &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU64 | BPF_DIV | BPF_K:
+		case BPF_ALU64 | BPF_DIV | BPF_X:
+		case BPF_ALU64 | BPF_MOD | BPF_K:
+		case BPF_ALU64 | BPF_MOD | BPF_X:
+			goto notyet;
+		/* dst = dst >> imm */
+		/* dst = dst << imm */
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 31))
+				return -EINVAL;
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+					  false, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst << imm */
+		case BPF_ALU64 | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst >> imm */
+		case BPF_ALU64 | BPF_RSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst << src */
+		case BPF_ALU64 | BPF_LSH | BPF_X:
+			emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src */
+		case BPF_ALU64 | BPF_RSH | BPF_X:
+			emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> imm (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = ~dst */
+		case BPF_ALU | BPF_NEG:
+			emit_ia32_alu_i(is64, false, BPF_OP(code),
+					dst_lo, 0, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = ~dst (64 bit) */
+		case BPF_ALU64 | BPF_NEG:
+			emit_ia32_neg64(dst, dstk, &prog);
+			break;
+		/* dst = dst * src/imm */
+		case BPF_ALU64 | BPF_MUL | BPF_X:
+		case BPF_ALU64 | BPF_MUL | BPF_K:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = htole(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_LE:
+			emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = htobe(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_BE:
+			emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = imm64 */
+		case BPF_LD | BPF_IMM | BPF_DW: {
+			s32 hi, lo = imm32;
+
+			hi = insn[1].imm;
+			emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+			insn++;
+			i++;
+			break;
+		}
+		/* ST: *(u8*)(dst_reg + off) = imm */
+		case BPF_ST | BPF_MEM | BPF_H:
+		case BPF_ST | BPF_MEM | BPF_B:
+		case BPF_ST | BPF_MEM | BPF_W:
+		case BPF_ST | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0xC6, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0xC7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0xC7, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+			else
+				EMIT1_off32(add_1reg(0x80, IA32_EAX),
+					    insn->off);
+			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				u32 hi;
+
+				hi = imm32 & (1<<31) ? (u32)~0 : 0;
+				EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+					    insn->off + 4);
+				EMIT(hi, 4);
+			}
+			break;
+
+		/* STX: *(u8*)(dst_reg + off) = src_reg */
+		case BPF_STX | BPF_MEM | BPF_B:
+		case BPF_STX | BPF_MEM | BPF_H:
+		case BPF_STX | BPF_MEM | BPF_W:
+		case BPF_STX | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			if (sstk)
+				/* mov edx,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov edx,src_lo */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0x88, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0x89); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x89, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				if (sstk)
+					/* mov edi,dword ptr [ebp+off] */
+					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+							     IA32_EDX),
+					      STACK_VAR(src_hi));
+				else
+					/* mov edi,src_hi */
+					EMIT2(0x8B, add_2reg(0xC0, src_hi,
+							     IA32_EDX));
+				EMIT1(0x89);
+				if (is_imm8(insn->off + 4)) {
+					EMIT2(add_2reg(0x40, IA32_EAX,
+						       IA32_EDX),
+					      insn->off + 4);
+				} else {
+					EMIT1(add_2reg(0x80, IA32_EAX,
+						       IA32_EDX));
+					EMIT(insn->off + 4, 4);
+				}
+			}
+			break;
+
+		/* LDX: dst_reg = *(u8*)(src_reg + off) */
+		case BPF_LDX | BPF_MEM | BPF_B:
+		case BPF_LDX | BPF_MEM | BPF_H:
+		case BPF_LDX | BPF_MEM | BPF_W:
+		case BPF_LDX | BPF_MEM | BPF_DW:
+			if (sstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT2(0x0F, 0xB6); break;
+			case BPF_H:
+				EMIT2(0x0F, 0xB7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x8B, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (dstk)
+				/* mov dword ptr [ebp+off],edx */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov dst_lo,edx */
+				EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+			case BPF_H:
+			case BPF_W:
+				if (dstk) {
+					EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+					      STACK_VAR(dst_hi));
+					EMIT(0x0, 4);
+				} else {
+					EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+				}
+				break;
+			case BPF_DW:
+				EMIT2_off32(0x8B,
+					    add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off + 4);
+				if (dstk)
+					EMIT3(0x89,
+					      add_2reg(0x40, IA32_EBP,
+						       IA32_EDX),
+					      STACK_VAR(dst_hi));
+				else
+					EMIT2(0x89,
+					      add_2reg(0xC0, dst_hi, IA32_EDX));
+				break;
+			default:
+				break;
+			}
+			break;
+		/* call */
+		case BPF_JMP | BPF_CALL:
+		{
+			const u8 *r1 = bpf2ia32[BPF_REG_1];
+			const u8 *r2 = bpf2ia32[BPF_REG_2];
+			const u8 *r3 = bpf2ia32[BPF_REG_3];
+			const u8 *r4 = bpf2ia32[BPF_REG_4];
+			const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				goto notyet;
+
+			func = (u8 *) __bpf_call_base + imm32;
+			jmp_offset = func - (image + addrs[i]);
+
+			if (!imm32 || !is_simm32(jmp_offset)) {
+				pr_err("unsupported BPF func %d addr %p image %p\n",
+				       imm32, func, image);
+				return -EINVAL;
+			}
+
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r1[0]));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r1[1]));
+
+			emit_push_r64(r5, &prog);
+			emit_push_r64(r4, &prog);
+			emit_push_r64(r3, &prog);
+			emit_push_r64(r2, &prog);
+
+			EMIT1_off32(0xE8, jmp_offset + 9);
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[1]));
+
+			/* add esp,32 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+			break;
+		}
+		case BPF_JMP | BPF_TAIL_CALL:
+			emit_bpf_tail_call(&prog);
+			break;
+
+		/* cond jump */
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JNE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JLT | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JLE | BPF_X:
+		case BPF_JMP | BPF_JSGT | BPF_X:
+		case BPF_JMP | BPF_JSLE | BPF_X:
+		case BPF_JMP | BPF_JSLT | BPF_X:
+		case BPF_JMP | BPF_JSGE | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JNE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JLT | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JLE | BPF_K:
+		case BPF_JMP | BPF_JSGT | BPF_K:
+		case BPF_JMP | BPF_JSLE | BPF_K:
+		case BPF_JMP | BPF_JSLT | BPF_K:
+		case BPF_JMP | BPF_JSGE | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				jmp_cond = IA32_JE;
+				break;
+			case BPF_JSET:
+			case BPF_JNE:
+				jmp_cond = IA32_JNE;
+				break;
+			case BPF_JGT:
+				/* GT is unsigned '>', JA in x86 */
+				jmp_cond = IA32_JA;
+				break;
+			case BPF_JLT:
+				/* LT is unsigned '<', JB in x86 */
+				jmp_cond = IA32_JB;
+				break;
+			case BPF_JGE:
+				/* GE is unsigned '>=', JAE in x86 */
+				jmp_cond = IA32_JAE;
+				break;
+			case BPF_JLE:
+				/* LE is unsigned '<=', JBE in x86 */
+				jmp_cond = IA32_JBE;
+				break;
+			case BPF_JSGT:
+				/* Signed '>', GT in x86 */
+				jmp_cond = IA32_JG;
+				break;
+			case BPF_JSLT:
+				/* Signed '<', LT in x86 */
+				jmp_cond = IA32_JL;
+				break;
+			case BPF_JSGE:
+				/* Signed '>=', GE in x86 */
+				jmp_cond = IA32_JGE;
+				break;
+			case BPF_JSLE:
+				/* Signed '<=', LE in x86 */
+				jmp_cond = IA32_JLE;
+				break;
+			default: /* to silence GCC warning */
+				return -EFAULT;
+			}
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(jmp_cond, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+			} else {
+				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+		}
+		case BPF_JMP | BPF_JA:
+			if (insn->off == -1)
+				/* -1 jmp instructions will always jump
+				 * backwards two bytes. Explicitly handling
+				 * this case avoids wasting too many passes
+				 * when there are long sequences of replaced
+				 * dead code.
+				 */
+				jmp_offset = -2;
+			else
+				jmp_offset = addrs[i + insn->off] - addrs[i];
+
+			if (!jmp_offset)
+				/* Optimize out nop jumps */
+				break;
+emit_jmp:
+			if (is_imm8(jmp_offset)) {
+				EMIT2(0xEB, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT1_off32(0xE9, jmp_offset);
+			} else {
+				pr_err("jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+			break;
+
+		case BPF_LD | BPF_ABS | BPF_W:
+		case BPF_LD | BPF_ABS | BPF_H:
+		case BPF_LD | BPF_ABS | BPF_B:
+		case BPF_LD | BPF_IND | BPF_W:
+		case BPF_LD | BPF_IND | BPF_H:
+		case BPF_LD | BPF_IND | BPF_B:
+		{
+			int size;
+			const u8 *r6 = bpf2ia32[BPF_REG_6];
+
+			/* Setting up first argument */
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r6[0]));
+
+			/* Setting up second argument */
+			if (BPF_MODE(code) == BPF_ABS) {
+				/* mov %edx, imm32 */
+				EMIT1_off32(0xBA, imm32);
+			} else {
+				if (sstk)
+					/* mov edx,dword ptr [ebp+off] */
+					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+							     IA32_EDX),
+					      STACK_VAR(src_lo));
+				else
+					/* mov edx,src_lo */
+					EMIT2(0x8B, add_2reg(0xC0, src_lo,
+							     IA32_EDX));
+				if (imm32) {
+					if (is_imm8(imm32))
+						/* add %edx,imm8 */
+						EMIT3(0x83, 0xC2, imm32);
+					else
+						/* add %edx,imm32 */
+						EMIT2_off32(0x81, 0xC2, imm32);
+				}
+			}
+
+			/* Setting up third argument */
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				size = 4;
+				break;
+			case BPF_H:
+				size = 2;
+				break;
+			case BPF_B:
+				size = 1;
+				break;
+			default:
+				return -EINVAL;
+			}
+			/* mov ecx,val */
+			EMIT2(0xB1, size);
+			/* movzx ecx,ecx */
+			EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+
+			/* mov ebx,ebp */
+			EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX));
+			/* add %ebx,imm8 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER);
+			/* push ebx */
+			EMIT1(0x53);
+
+			/* Setting up function pointer to call */
+			/* mov ebx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX),
+				    (unsigned int)bpf_load_pointer);
+
+			EMIT2(0xFF, add_1reg(0xD0, IA32_EBX));
+			/* add %esp,4 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4);
+			/* xor edx,edx */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[1]));
+
+			/*
+			 * Check if return address is NULL or not.
+			 * If NULL then jump to epilogue else continue
+			 * to load the value from retn address
+			 */
+			EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
+			jmp_offset = ctx->cleanup_addr - addrs[i];
+
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				jmp_offset += 7;
+				break;
+			case BPF_H:
+				jmp_offset += 10;
+				break;
+			case BPF_B:
+				jmp_offset += 6;
+				break;
+			}
+
+			EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
+			/* Load value from the address */
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				/* mov eax,[eax] */
+				EMIT2(0x8B, 0x0);
+				/* Emit 'bswap eax' */
+				EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
+				break;
+			case BPF_H:
+				EMIT3(0x0F, 0xB7, 0x0);
+				EMIT1(0x66);
+				EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
+				break;
+			case BPF_B:
+				EMIT3(0x0F, 0xB6, 0x0);
+				break;
+			}
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			break;
+		}
+		/* STX XADD: lock *(u32 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_W:
+		/* STX XADD: lock *(u64 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_DW:
+			goto notyet;
+		case BPF_JMP | BPF_EXIT:
+			if (seen_exit) {
+				jmp_offset = ctx->cleanup_addr - addrs[i];
+				goto emit_jmp;
+			}
+			seen_exit = true;
+			/* Update cleanup_addr */
+			ctx->cleanup_addr = proglen;
+			emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+			break;
+notyet:
+			pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+			return -EFAULT;
+		default:
+			/*
+			 * This error will be seen if new instruction was added
+			 * to interpreter, but not to JIT or if there is junk in
+			 * bpf_prog
+			 */
+			pr_err("bpf_jit: unknown opcode %02x\n", code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (ilen > BPF_MAX_INSN_SIZE) {
+			pr_err("bpf_jit: fatal insn size error\n");
+			return -EFAULT;
+		}
+
+		if (image) {
+			if (unlikely(proglen + ilen > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/*
+	 * If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out;
+	}
+
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image.
+	 */
+	for (pass = 0; pass < 20 || image; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+		if (proglen <= 0) {
+out_image:
+			image = NULL;
+			if (header)
+				bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		if (image) {
+			if (proglen != oldproglen) {
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+				goto out_image;
+			}
+			break;
+		}
+		if (proglen == oldproglen) {
+			header = bpf_jit_binary_alloc(proglen, &image,
+						      1, jit_fill_hole);
+			if (!header) {
+				prog = orig_prog;
+				goto out_addrs;
+			}
+		}
+		oldproglen = proglen;
+		cond_resched();
+	}
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+	if (image) {
+		bpf_jit_binary_lock_ro(header);
+		prog->bpf_func = (void *)image;
+		prog->jited = 1;
+		prog->jited_len = proglen;
+	} else {
+		prog = orig_prog;
+	}
+
+out_addrs:
+	kfree(addrs);
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	return prog;
+}
-- 
cgit 


From e782bdcf58c5ace7b7d58b2436177de9785a18e8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:16 +0200
Subject: bpf, x64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from x64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/Makefile       |   3 +-
 arch/x86/net/bpf_jit.S      | 154 --------------------------------------------
 arch/x86/net/bpf_jit_comp.c | 144 ++---------------------------------------
 3 files changed, 5 insertions(+), 296 deletions(-)
 delete mode 100644 arch/x86/net/bpf_jit.S

(limited to 'arch/x86/net')

diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index c6b464a261bb..59e123da580c 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -5,6 +5,5 @@
 ifeq ($(CONFIG_X86_32),y)
         obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
 else
-        OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
-        obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
 endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
deleted file mode 100644
index b33093f84528..000000000000
--- a/arch/x86/net/bpf_jit.S
+++ /dev/null
@@ -1,154 +0,0 @@
-/* bpf_jit.S : BPF JIT helper functions
- *
- * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-/*
- * Calling convention :
- * rbx : skb pointer (callee saved)
- * esi : offset of byte(s) to fetch in skb (can be scratched)
- * r10 : copy of skb->data
- * r9d : hlen = skb->len - skb->data_len
- */
-#define SKBDATA	%r10
-#define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
-
-#define FUNC(name) \
-	.globl name; \
-	.type name, @function; \
-	name:
-
-FUNC(sk_load_word)
-	test	%esi,%esi
-	js	bpf_slow_path_word_neg
-
-FUNC(sk_load_word_positive_offset)
-	mov	%r9d,%eax		# hlen
-	sub	%esi,%eax		# hlen - offset
-	cmp	$3,%eax
-	jle	bpf_slow_path_word
-	mov     (SKBDATA,%rsi),%eax
-	bswap   %eax  			/* ntohl() */
-	ret
-
-FUNC(sk_load_half)
-	test	%esi,%esi
-	js	bpf_slow_path_half_neg
-
-FUNC(sk_load_half_positive_offset)
-	mov	%r9d,%eax
-	sub	%esi,%eax		#	hlen - offset
-	cmp	$1,%eax
-	jle	bpf_slow_path_half
-	movzwl	(SKBDATA,%rsi),%eax
-	rol	$8,%ax			# ntohs()
-	ret
-
-FUNC(sk_load_byte)
-	test	%esi,%esi
-	js	bpf_slow_path_byte_neg
-
-FUNC(sk_load_byte_positive_offset)
-	cmp	%esi,%r9d   /* if (offset >= hlen) goto bpf_slow_path_byte */
-	jle	bpf_slow_path_byte
-	movzbl	(SKBDATA,%rsi),%eax
-	ret
-
-/* rsi contains offset and can be scratched */
-#define bpf_slow_path_common(LEN)		\
-	lea	32(%rbp), %rdx;\
-	FRAME_BEGIN;				\
-	mov	%rbx, %rdi; /* arg1 == skb */	\
-	push	%r9;				\
-	push	SKBDATA;			\
-/* rsi already has offset */			\
-	mov	$LEN,%ecx;	/* len */	\
-	call	skb_copy_bits;			\
-	test    %eax,%eax;			\
-	pop	SKBDATA;			\
-	pop	%r9;				\
-	FRAME_END
-
-
-bpf_slow_path_word:
-	bpf_slow_path_common(4)
-	js	bpf_error
-	mov	32(%rbp),%eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half:
-	bpf_slow_path_common(2)
-	js	bpf_error
-	mov	32(%rbp),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte:
-	bpf_slow_path_common(1)
-	js	bpf_error
-	movzbl	32(%rbp),%eax
-	ret
-
-#define sk_negative_common(SIZE)				\
-	FRAME_BEGIN;						\
-	mov	%rbx, %rdi; /* arg1 == skb */			\
-	push	%r9;						\
-	push	SKBDATA;					\
-/* rsi already has offset */					\
-	mov	$SIZE,%edx;	/* size */			\
-	call	bpf_internal_load_pointer_neg_helper;		\
-	test	%rax,%rax;					\
-	pop	SKBDATA;					\
-	pop	%r9;						\
-	FRAME_END;						\
-	jz	bpf_error
-
-bpf_slow_path_word_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi	/* test range */
-	jl	bpf_error	/* offset lower -> error  */
-
-FUNC(sk_load_word_negative_offset)
-	sk_negative_common(4)
-	mov	(%rax), %eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_half_negative_offset)
-	sk_negative_common(2)
-	mov	(%rax),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_byte_negative_offset)
-	sk_negative_common(1)
-	movzbl	(%rax), %eax
-	ret
-
-bpf_error:
-# force a return 0 from jit handler
-	xor	%eax,%eax
-	mov	(%rbp),%rbx
-	mov	8(%rbp),%r13
-	mov	16(%rbp),%r14
-	mov	24(%rbp),%r15
-	add	$40, %rbp
-	leaveq
-	ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 1c3c81ddeb87..ce08b7bb0304 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,15 +17,6 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 
-/*
- * Assembly code in arch/x86/net/bpf_jit.S
- */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[];
-extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[];
-
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -107,9 +98,6 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define X86_JLE 0x7E
 #define X86_JG  0x7F
 
-#define CHOOSE_LOAD_FUNC(K, func) \
-	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
@@ -120,8 +108,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- * R9  caches skb->len - skb->data_len
- * R10 caches skb->data, and used for blinding (if enabled)
+ * Also x86-64 register R9 is unused. x86-64 register R10 is
+ * used for blinding (if enabled).
  */
 static const int reg2hex[] = {
 	[BPF_REG_0] = 0,  /* RAX */
@@ -196,19 +184,15 @@ static void jit_fill_hole(void *area, unsigned int size)
 
 struct jit_context {
 	int cleanup_addr; /* Epilogue code offset */
-	bool seen_ld_abs;
-	bool seen_ax_reg;
 };
 
 /* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-#define AUX_STACK_SPACE \
-	(32 /* Space for RBX, R13, R14, R15 */ + \
-	  8 /* Space for skb_copy_bits() buffer */)
+#define AUX_STACK_SPACE		40 /* Space for RBX, R13, R14, R15, tailcnt */
 
-#define PROLOGUE_SIZE 37
+#define PROLOGUE_SIZE		37
 
 /*
  * Emit x86-64 prologue code for BPF program and check its size.
@@ -232,20 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* All classic BPF filters use R6(rbx) save it */
-
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
-
-	/*
-	 * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
-	 * R8(R14). R9(R15) spill could be made conditional, but there is only
-	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
-	 * The overhead of extra spill is negligible for any filter other
-	 * than synthetic ones. Therefore not worth adding complexity.
-	 */
-
 	/* mov qword ptr [rbp+8],r13 */
 	EMIT4(0x4C, 0x89, 0x6D, 8);
 	/* mov qword ptr [rbp+16],r14 */
@@ -353,27 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
-
-static void emit_load_skb_data_hlen(u8 **pprog)
-{
-	u8 *prog = *pprog;
-	int cnt = 0;
-
-	/*
-	 * r9d = skb->len - skb->data_len (headlen)
-	 * r10 = skb->data
-	 */
-	/* mov %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
-
-	/* sub %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
-
-	/* mov %r10, off32(%rdi) */
-	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
-	*pprog = prog;
-}
-
 static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 			   u32 dst_reg, const u32 imm32)
 {
@@ -462,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 {
 	struct bpf_insn *insn = bpf_prog->insnsi;
 	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
 	int i, cnt = 0;
@@ -473,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog));
 
-	if (seen_ld_abs)
-		emit_load_skb_data_hlen(&prog);
-
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
 		u32 dst_reg = insn->dst_reg;
@@ -483,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
-		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
-		if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
-			ctx->seen_ax_reg = seen_ax_reg = true;
-
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -916,36 +858,12 @@ xadd:			if (is_imm8(insn->off))
 		case BPF_JMP | BPF_CALL:
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
-			if (seen_ld_abs) {
-				reload_skb_data = bpf_helper_changes_pkt_data(func);
-				if (reload_skb_data) {
-					EMIT1(0x57); /* push %rdi */
-					jmp_offset += 22; /* pop, mov, sub, mov */
-				} else {
-					EMIT2(0x41, 0x52); /* push %r10 */
-					EMIT2(0x41, 0x51); /* push %r9 */
-					/*
-					 * We need to adjust jmp offset, since
-					 * pop %r9, pop %r10 take 4 bytes after call insn
-					 */
-					jmp_offset += 4;
-				}
-			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
 				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
 			EMIT1_off32(0xE8, jmp_offset);
-			if (seen_ld_abs) {
-				if (reload_skb_data) {
-					EMIT1(0x5F); /* pop %rdi */
-					emit_load_skb_data_hlen(&prog);
-				} else {
-					EMIT2(0x41, 0x59); /* pop %r9 */
-					EMIT2(0x41, 0x5A); /* pop %r10 */
-				}
-			}
 			break;
 
 		case BPF_JMP | BPF_TAIL_CALL:
@@ -1080,60 +998,6 @@ emit_jmp:
 			}
 			break;
 
-		case BPF_LD | BPF_IND | BPF_W:
-			func = sk_load_word;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_W:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
-common_load:
-			ctx->seen_ld_abs = seen_ld_abs = true;
-			jmp_offset = func - (image + addrs[i]);
-			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported BPF func %d addr %p image %p\n",
-				       imm32, func, image);
-				return -EINVAL;
-			}
-			if (BPF_MODE(insn->code) == BPF_ABS) {
-				/* mov %esi, imm32 */
-				EMIT1_off32(0xBE, imm32);
-			} else {
-				/* mov %rsi, src_reg */
-				EMIT_mov(BPF_REG_2, src_reg);
-				if (imm32) {
-					if (is_imm8(imm32))
-						/* add %esi, imm8 */
-						EMIT3(0x83, 0xC6, imm32);
-					else
-						/* add %esi, imm32 */
-						EMIT2_off32(0x81, 0xC6, imm32);
-				}
-			}
-			/*
-			 * skb pointer is in R6 (%rbx), it will be copied into
-			 * %rdi if skb_copy_bits() call is necessary.
-			 * sk_load_* helpers also use %r10 and %r9d.
-			 * See bpf_jit.S
-			 */
-			if (seen_ax_reg)
-				/* r10 = skb->data, mov %r10, off32(%rbx) */
-				EMIT3_off32(0x4c, 0x8b, 0x93,
-					    offsetof(struct sk_buff, data));
-			EMIT1_off32(0xE8, jmp_offset); /* call */
-			break;
-
-		case BPF_LD | BPF_IND | BPF_H:
-			func = sk_load_half;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_H:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
-			goto common_load;
-		case BPF_LD | BPF_IND | BPF_B:
-			func = sk_load_byte;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_B:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
-			goto common_load;
-
 		case BPF_JMP | BPF_EXIT:
 			if (seen_exit) {
 				jmp_offset = ctx->cleanup_addr - addrs[i];
-- 
cgit 


From 24dea04767e6e5175f4750770281b0c17ac6a2fb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:23 +0200
Subject: bpf, x32: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from x32 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp32.c | 136 +-----------------------------------------
 1 file changed, 1 insertion(+), 135 deletions(-)

(limited to 'arch/x86/net')

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 61e61341b777..0cc04e30adc1 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -175,19 +175,13 @@ static const u8 bpf2ia32[][2] = {
 #define SCRATCH_SIZE 96
 
 /* Total stack size used in JITed code */
-#define _STACK_SIZE \
-	(stack_depth + \
-	 + SCRATCH_SIZE + \
-	 + 4 /* Extra space for skb_copy_bits buffer */)
+#define _STACK_SIZE	(stack_depth + SCRATCH_SIZE)
 
 #define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
 
 /* Get the offset of eBPF REGISTERs stored on scratch space. */
 #define STACK_VAR(off) (off)
 
-/* Offset of skb_copy_bits buffer */
-#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
-
 /* Encode 'dst_reg' register into IA32 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
@@ -2276,134 +2270,6 @@ emit_jmp:
 				return -EFAULT;
 			}
 			break;
-
-		case BPF_LD | BPF_ABS | BPF_W:
-		case BPF_LD | BPF_ABS | BPF_H:
-		case BPF_LD | BPF_ABS | BPF_B:
-		case BPF_LD | BPF_IND | BPF_W:
-		case BPF_LD | BPF_IND | BPF_H:
-		case BPF_LD | BPF_IND | BPF_B:
-		{
-			int size;
-			const u8 *r6 = bpf2ia32[BPF_REG_6];
-
-			/* Setting up first argument */
-			/* mov eax,dword ptr [ebp+off] */
-			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
-			      STACK_VAR(r6[0]));
-
-			/* Setting up second argument */
-			if (BPF_MODE(code) == BPF_ABS) {
-				/* mov %edx, imm32 */
-				EMIT1_off32(0xBA, imm32);
-			} else {
-				if (sstk)
-					/* mov edx,dword ptr [ebp+off] */
-					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
-							     IA32_EDX),
-					      STACK_VAR(src_lo));
-				else
-					/* mov edx,src_lo */
-					EMIT2(0x8B, add_2reg(0xC0, src_lo,
-							     IA32_EDX));
-				if (imm32) {
-					if (is_imm8(imm32))
-						/* add %edx,imm8 */
-						EMIT3(0x83, 0xC2, imm32);
-					else
-						/* add %edx,imm32 */
-						EMIT2_off32(0x81, 0xC2, imm32);
-				}
-			}
-
-			/* Setting up third argument */
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				size = 4;
-				break;
-			case BPF_H:
-				size = 2;
-				break;
-			case BPF_B:
-				size = 1;
-				break;
-			default:
-				return -EINVAL;
-			}
-			/* mov ecx,val */
-			EMIT2(0xB1, size);
-			/* movzx ecx,ecx */
-			EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
-
-			/* mov ebx,ebp */
-			EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX));
-			/* add %ebx,imm8 */
-			EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER);
-			/* push ebx */
-			EMIT1(0x53);
-
-			/* Setting up function pointer to call */
-			/* mov ebx,imm32*/
-			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX),
-				    (unsigned int)bpf_load_pointer);
-
-			EMIT2(0xFF, add_1reg(0xD0, IA32_EBX));
-			/* add %esp,4 */
-			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4);
-			/* xor edx,edx */
-			EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX));
-
-			/* mov dword ptr [ebp+off],eax */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
-			      STACK_VAR(r0[0]));
-			/* mov dword ptr [ebp+off],edx */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
-			      STACK_VAR(r0[1]));
-
-			/*
-			 * Check if return address is NULL or not.
-			 * If NULL then jump to epilogue else continue
-			 * to load the value from retn address
-			 */
-			EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
-			jmp_offset = ctx->cleanup_addr - addrs[i];
-
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				jmp_offset += 7;
-				break;
-			case BPF_H:
-				jmp_offset += 10;
-				break;
-			case BPF_B:
-				jmp_offset += 6;
-				break;
-			}
-
-			EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
-			/* Load value from the address */
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				/* mov eax,[eax] */
-				EMIT2(0x8B, 0x0);
-				/* Emit 'bswap eax' */
-				EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
-				break;
-			case BPF_H:
-				EMIT3(0x0F, 0xB7, 0x0);
-				EMIT1(0x66);
-				EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
-				break;
-			case BPF_B:
-				EMIT3(0x0F, 0xB6, 0x0);
-				break;
-			}
-
-			/* mov dword ptr [ebp+off],eax */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
-			      STACK_VAR(r0[0]));
-			break;
-		}
 		/* STX XADD: lock *(u32 *)(dst + off) += src */
 		case BPF_STX | BPF_XADD | BPF_W:
 		/* STX XADD: lock *(u64 *)(dst + off) += src */
-- 
cgit