#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This tests basic flowtable functionality.
# Creates following default topology:
#
# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading, Router2 has no special
# purpose other than having a link whose MTU is smaller than that of both
# Originator and Responder, i.e. TCPMSS announced values are too large and
# will still result in fragmentation and/or PMTU discovery.
#
# You can check with different Originator/Link/Responder MTUs, e.g.:
# nft_flowtable.sh -o8000 -l1500 -r2000
#

# Random per-run suffix for the namespace names.  "mktemp -u" only prints
# a name from the template, it does not create a file.
sfx=$(mktemp -u "XXXXXXXX")
# ns1/ns2 are the two endpoints, nsr1/nsr2 the two routers between them.
ns1="ns1-$sfx"
ns2="ns2-$sfx"
nsr1="nsr1-$sfx"
nsr2="nsr2-$sfx"

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
# Overall script result, passed to "exit" at the end; set to 1 on failure.
ret=0

# Temporary file names (input file and the two received copies).  They are
# assigned with mktemp later on; cleanup() refers to them unconditionally.
nsin=""
ns1out=""
ns2out=""

# Remember the current value so that cleanup() can restore it.
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

# Run a probe command and skip the whole test when it fails.
#   $1 - command line to run; expanded unquoted on purpose so that it is
#        split into command and arguments.  Its output is discarded.
#   $2 - text appended to "SKIP: Could not " when the probe fails.
# Exits the script with $ksft_skip on failure, returns 0 otherwise.
checktool() {
	$1 > /dev/null 2>&1 && return 0

	echo "SKIP: Could not $2"
	exit $ksft_skip
}

# Make sure all required tools exist and that network namespaces can be
# created; otherwise the test is skipped.  The last probe creates $nsr1
# as a side effect.
checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool"
checktool "which nc" "run test without nc (netcat)"
checktool "ip netns add $nsr1" "create net namespace $nsr1"

# $nsr1 exists already (see above), create the remaining three namespaces.
ip netns add $ns1
ip netns add $ns2
ip netns add $nsr2

# EXIT trap handler: remove everything the test created.
# Globals read: ns1, ns2, nsr1, nsr2 (namespace names),
#               nsin, ns1out, ns2out (temp file names, may still be empty),
#               log_netns (saved nf_log_all_netns value, empty if the
#               sysctl could not be read).
cleanup() {
	local ns
	local f

	for ns in "$ns1" "$ns2" "$nsr1" "$nsr2"; do
		ip netns del "$ns"
	done

	# The file names are only assigned shortly before the transfer tests,
	# so skip the ones that are still empty.
	for f in "$nsin" "$ns1out" "$ns2out"; do
		if [ -n "$f" ]; then
			rm -f "$f"
		fi
	done

	# The script sets the sysctl to 1, so it only has to be restored when
	# the saved value was 0.  The comparison is quoted so that an empty
	# value (sysctl not readable) does not make "[" fail with a syntax
	# error.
	if [ "$log_netns" = "0" ]; then
		sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
	fi
}

# From here on, always tear down namespaces and temp files on exit.
trap cleanup EXIT

# The previous value was saved in log_netns and is restored by cleanup().
sysctl -q net.netfilter.nf_log_all_netns=1

# veth pairs: ns1:eth0 <-> nsr1:veth0, nsr1:veth1 <-> nsr2:veth0
ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1
ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2

# ... and nsr2:veth1 <-> ns2:eth0
ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2

# Bring up loopback and both veth devices in each router namespace.
for dev in lo veth0 veth1; do
    ip -net $nsr1 link set $dev up
    ip -net $nsr2 link set $dev up
done

# Router addresses on the links facing the endpoints.
ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net $nsr1 addr add dead:1::1/64 dev veth0

ip -net $nsr2 addr add 10.0.2.1/24 dev veth1
ip -net $nsr2 addr add dead:2::1/64 dev veth1

# set different MTUs so we need to push packets coming from ns1 (large MTU)
# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
# or to do PMTU discovery (send ICMP error back to originator).
# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
# is NOT the lowest link mtu.

# Default originator / link / responder MTUs; the -o, -l and -r command
# line options overwrite these values.
omtu=9000
lmtu=1500
rmtu=2000

# Print the command line help to stdout and terminate the script with
# exit status 1.
usage() {
	printf '%s\n' \
		"nft_flowtable.sh [OPTIONS]" \
		"" \
		"MTU options" \
		"   -o originator" \
		"   -l link" \
		"   -r responder"
	exit 1
}

# Parse the optional MTU overrides; anything else prints the usage text
# and exits.
while getopts "o:l:r:" o
do
	case $o in
		o) omtu=$OPTARG;;
		l) lmtu=$OPTARG;;
		r) rmtu=$OPTARG;;
		*) usage;;
	esac
done

# Originator side: nsr1:veth0 <-> ns1:eth0.  A failure here most likely
# means that an unusable MTU value was passed on the command line.
if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then
	exit 1
fi

ip -net "$ns1" link set eth0 mtu "$omtu"

# Link between the routers: nsr1:veth1 <-> nsr2:veth0.  Without this the
# -l option was accepted but never had any effect.
if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then
	exit 1
fi

if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then
	exit 1
fi

# Responder side: nsr2:veth1 <-> ns2:eth0.
if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then
	exit 1
fi

ip -net "$ns2" link set eth0 mtu "$rmtu"

# transfer-net between nsr1 and nsr2.
# these addresses are not used for connections.
ip -net $nsr1 addr add 192.168.10.1/24 dev veth1
ip -net $nsr1 addr add fee1:2::1/64 dev veth1

ip -net $nsr2 addr add 192.168.10.2/24 dev veth0
ip -net $nsr2 addr add fee1:2::2/64 dev veth0

# Enable IPv4 forwarding on veth0 and veth1 of both routers.
for i in 0 1; do
  ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
  ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
done

# Common endpoint setup for ns1 and ns2.
for ns in $ns1 $ns2;do
  ip -net $ns link set lo up
  ip -net $ns link set eth0 up

  # Abort the test if the sysctl cannot be set in the endpoint namespace.
  if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
	echo "ERROR: Check Originator/Responder values (problem during address addition)"
	exit 1
  fi
  # don't set ip DF bit for first two tests
  ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

# Endpoint addresses; the default routes point at the adjacent router.
ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net $ns2 addr add 10.0.2.99/24 dev eth0
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net $ns2 addr add dead:2::99/64 dev eth0
ip -net $ns1 route add default via dead:1::1
ip -net $ns2 route add default via dead:2::1

# The routers reach the remote endpoint network via each other's
# transfer-net address.
ip -net $nsr1 route add default via 192.168.10.2
ip -net $nsr2 route add default via 192.168.10.1

# Load the ruleset under test into nsr1: flowtable f1 on veth0/veth1, two
# named counters and a forward chain with policy drop.  The rule comments
# inside the here-document are part of the text handed to nft.
ip netns exec $nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   counter routed_orig { }
   counter routed_repl { }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept

      # count packets supposedly offloaded as per direction.
      ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept

      ct state established,related accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF

# $? is the exit status of the nft invocation above; if the ruleset cannot
# be loaded the test is skipped rather than failed.
if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

# test basic connectivity
# A single ping in each direction; without basic routing none of the
# following transfer tests can work, so give up right away.
if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
  echo "ERROR: $ns1 cannot reach $ns2" 1>&2
  exit 1
fi

if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
  exit 1
fi

if [ $ret -eq 0 ];then
	echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
fi

# Input file sent in both directions, and the copies received by ns1 and
# ns2.  cleanup() removes them again on exit.
nsin=$(mktemp)
ns1out=$(mktemp)
ns2out=$(mktemp)

# Fill a file with random data of random size.
#   $1 - path of the file to (over)write
# The file gets between 8 MiB and 136 MiB (minus 1 KiB) of whole KiB
# blocks followed by a tail of 128..1151 single bytes, so that the total
# size is not a multiple of the block size.
# Globals written: name, SIZE (scratch), TSIZE (total size in bytes).
make_file()
{
	name=$1

	SIZE=$((RANDOM % (1024 * 128)))
	SIZE=$((SIZE + (1024 * 8)))
	TSIZE=$((SIZE * 1024))

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null

	SIZE=$((RANDOM % 1024))
	SIZE=$((SIZE + 128))
	TSIZE=$((TSIZE + SIZE))
	# Append the tail.  The previous "conf=notrunc" operand was a typo that
	# made dd bail out (silently, stderr is discarded), so no tail was ever
	# written.  Appending through the shell keeps the existing data without
	# depending on dd's conv=/oflag= support.
	dd if=/dev/urandom bs=1 count=$SIZE 2> /dev/null >> "$name"
}

# Read and reset the two named counters in nsr1 and compare their byte
# counts with the size of the input file.
#   $1 - description used in the PASS/FAIL message
# The "original" counter may not exceed the size of $nsin, the "reply"
# counter may not exceed a quarter of it.  On violation a FAIL line is
# written to stderr and the global ret is set to 1; otherwise a PASS line
# is printed.
# Globals read: nsr1, nsin.  Globals written: ret.
check_counters()
{
	local what=$1
	local failed=0
	local du_out limit_orig limit_repl
	local line_orig line_repl bytes_orig bytes_repl

	# "du -sb" prints "<bytes><TAB><path>"; cut at the first slash of the
	# path to keep the size.
	du_out=$(du -sb $nsin)
	limit_orig=${du_out%%/*}
	limit_repl=$((limit_orig/4))

	# "nft reset counter" prints the current values and zeroes the counter;
	# the byte count is whatever follows the word "bytes".
	line_orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets)
	line_repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets)
	bytes_orig=${line_orig#*bytes}
	bytes_repl=${line_repl#*bytes}

	if [ $bytes_orig -gt $limit_orig ];then
		echo "FAIL: $what: original counter $bytes_orig exceeds expected value $limit_orig" 1>&2
		ret=1
		failed=1
	fi

	if [ $bytes_repl -gt $limit_repl ];then
		echo "FAIL: $what: reply counter $bytes_repl exceeds expected value $limit_repl" 1>&2
		ret=1
		failed=1
	fi

	if [ $failed -eq 0 ]; then
		echo "PASS: $what"
	fi
}

# Compare a sent file with the received copy.
#   $1 - file that was sent
#   $2 - file that was received
#   $3 - description of the transfer direction, used in the FAIL message
# Returns 0 when both files are identical.  Otherwise prints a FAIL line
# to stderr plus "ls -l" of both files and returns 1.
check_transfer()
{
	# Keep these local: as globals, "what" overwrote the variable of the
	# same name that callers further up use for their own messages.
	local src=$1
	local dst=$2
	local what=$3

	if ! cmp "$src" "$dst" > /dev/null 2>&1; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$src"
		ls -l "$dst"
		return 1
	fi

	return 0
}

# Transfer $nsin in both directions over one TCP connection and verify
# both received copies.
#   $1 - namespace running the connecting nc
#   $2 - namespace running the listening nc (always on port 12345)
#   $3 - address the client connects to
#   $4 - port the client connects to
# Returns 0 when both copies match the input file, 1 otherwise.
# Globals read: nsin, ns1out, ns2out.  Globals written: lpid, cpid.
test_tcp_forwarding_ip()
{
	local nsa=$1
	local nsb=$2
	local dstip=$3
	local dstport=$4
	local lret=0
	local pid

	# Listener: sends the file to the peer, stores what it receives.
	ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
	lpid=$!

	# Give the listener time to start before connecting.
	sleep 1
	ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
	cpid=$!

	sleep 3

	# Terminate whichever side is still around, listener first.
	for pid in $lpid $cpid; do
		if ps -p $pid > /dev/null;then
			kill $pid
		fi
	done

	wait

	check_transfer "$nsin" "$ns2out" "ns1 -> ns2" || lret=1
	check_transfer "$nsin" "$ns1out" "ns1 <- ns2" || lret=1

	return $lret
}

# Run the transfer test against the responder's real address and port.
#   $1 - client namespace
#   $2 - server namespace
# Returns the status of test_tcp_forwarding_ip (a function returns the
# status of its last command).
test_tcp_forwarding()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
}

# Run the transfer test twice with NAT active on the router: first to the
# responder's real address (masquerade only), then to 10.6.6.6:1666 which
# the router's NAT rules redirect to the responder (dnat).
#   $1 - client namespace
#   $2 - server namespace
#   $3 - 1: PMTU discovery is active, verify the offload counters
#        0: only report the transfer result
#   $4 - text appended to the PASS/FAIL messages (may be empty)
# Returns the status of the last transfer that was run; the dnat transfer
# is only attempted when the first transfer succeeded.
test_tcp_forwarding_nat()
{
	local lret
	local pmtu=$3
	# Deliberately not called "what": the transfer helpers invoked below
	# assign a variable of that name, which used to replace this text in
	# the dnat message.
	local label=$4

	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
	lret=$?

	if [ $lret -eq 0 ] ; then
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $label"
		else
			echo "PASS: flow offload for ns1/ns2 with masquerade $label"
		fi

		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
		lret=$?
		# With pmtu=1 the counters are checked (and thereby reset) even if
		# the transfer itself failed.
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $label"
		elif [ $lret -eq 0 ] ; then
			echo "PASS: flow offload for ns1/ns2 with dnat $label"
		fi
	fi

	return $lret
}

# Create the random input file used by all transfer tests.
make_file "$nsin"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
# Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path.  Therefore, packet counters
# are not checked.
if test_tcp_forwarding $ns1 $ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# delete default route, i.e. ns2 won't be able to reach ns1 and
# will depend on ns1 being masqueraded in nsr1.
# expect ns1 has nsr1 address.
ip -net $ns2 route del default via 10.0.2.1
ip -net $ns2 route del default via dead:2::1
ip -net $ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same, but with NAT enabled.  Same as in first test: we expect normal forward path
# to handle most packets.
# The prerouting rule redirects 10.6.6.6:1666 arriving on veth0 to the
# responder, the postrouting rule masquerades everything leaving via veth1.
ip netns exec $nsr1 nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

# Third argument 0: report the transfer result only, no counter check.
if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# Third test:
# Same as second test, but with PMTU discovery enabled. This
# means that we expect the fastpath to handle packets as soon
# as the endpoints adjust the packet size.
ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

# reset counters.
# With pmtu in-place we'll also check that nft counters
# are lower than file size and packets were forwarded via flowtable layer.
# For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
ip netns exec $nsr1 nft reset counters table inet filter >/dev/null

if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec $nsr1 nft list ruleset
	# Record the failure like the other tests do; previously the script
	# could print this FAIL line and still exit with status 0.
	ret=1
fi

# Another test:
# Add bridge interface br0 to Router1, with NAT enabled.
# veth0 becomes a port of br0 and the router addresses facing ns1 move
# from veth0 to br0.
ip -net $nsr1 link add name br0 type bridge
ip -net $nsr1 addr flush dev veth0
ip -net $nsr1 link set up dev veth0
ip -net $nsr1 link set veth0 master br0
ip -net $nsr1 addr add 10.0.1.1/24 dev br0
ip -net $nsr1 addr add dead:1::1/64 dev br0
ip -net $nsr1 link set up dev br0

ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null

# br0 with NAT enabled.
# Same NAT rules as before, except that the dnat rule now matches on
# input interface br0 instead of veth0.
ip netns exec $nsr1 nft -f - <<EOF
flush table ip nat
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi


# Another test:
# Add bridge interface br0 to Router1, with NAT and VLAN.
# Replace veth0 as bridge port by a VLAN 10 device stacked on veth0.
ip -net $nsr1 link set veth0 nomaster
ip -net $nsr1 link set down dev veth0
ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10
ip -net $nsr1 link set up dev veth0
ip -net $nsr1 link set up dev veth0.10
ip -net $nsr1 link set veth0.10 master br0

# Matching VLAN 10 device in ns1; the addresses move from eth0 to eth0.10.
ip -net $ns1 addr flush dev eth0
ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10
ip -net $ns1 link set eth0 up
ip -net $ns1 link set eth0.10 up
ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns1 addr add dead:1::99/64 dev eth0.10

if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# restore test topology (remove bridge and VLAN)
ip -net $nsr1 link set veth0 nomaster
ip -net $nsr1 link set veth0 down
ip -net $nsr1 link set veth0.10 down
ip -net $nsr1 link delete veth0.10 type vlan
ip -net $nsr1 link delete br0 type bridge
ip -net $ns1 addr flush dev eth0.10
ip -net $ns1 link set eth0.10 down
ip -net $ns1 link set eth0 down
ip -net $ns1 link delete eth0.10 type vlan

# restore address in ns1 and nsr1
ip -net $ns1 link set eth0 up
ip -net $ns1 addr add 10.0.1.99/24 dev eth0
ip -net $ns1 route add default via 10.0.1.1
ip -net $ns1 addr add dead:1::99/64 dev eth0
ip -net $ns1 route add default via dead:1::1
ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
ip -net $nsr1 addr add dead:1::1/64 dev veth0
ip -net $nsr1 link set up dev veth0

# Throw-away key material for the ESP test: hex digests of the current
# process listing, prefixed with 0x.  KEY_SHA is used as the sha1 auth key
# and KEY_AES as the aes encryption key in do_esp().
KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
# One SPI per direction.
SPI1=$RANDOM
SPI2=$RANDOM

# Make sure the two SPIs differ.
if [ $SPI1 -eq $SPI2 ]; then
	SPI2=$((SPI2+1))
fi

# Configure one end of an ESP tunnel-mode connection in a namespace.
#   $1 - namespace to configure
#   $2 - tunnel address of this end
#   $3 - tunnel address of the peer
#   $4 - network behind this end
#   $5 - network behind the peer
#   $6 - SPI for outgoing traffic
#   $7 - SPI for incoming traffic
# Globals read: KEY_AES, KEY_SHA.
# Returns the status of the last "ip xfrm" command.
do_esp() {
    local netns=$1
    local local_ip=$2
    local peer_ip=$3
    local local_net=$4
    local peer_net=$5
    local spi_tx=$6
    local spi_rx=$7
    # Pieces shared by both states / both policies; expanded unquoted below
    # so that they split into separate arguments.
    local crypto="enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel"
    local policy_tail="proto esp mode tunnel priority 1 action allow"

    # incoming SA (peer -> this end), then outgoing SA (this end -> peer)
    ip -net $netns xfrm state add src $peer_ip dst $local_ip proto esp spi $spi_rx \
        $crypto sel src $peer_net dst $local_net
    ip -net $netns xfrm state add src $local_ip dst $peer_ip proto esp spi $spi_tx \
        $crypto sel src $local_net dst $peer_net

    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
    ip -net $netns xfrm policy add src $local_net dst $peer_net dir out \
        tmpl src $local_ip dst $peer_ip $policy_tail
    # to fwd decrypted packets after esp processing:
    ip -net $netns xfrm policy add src $peer_net dst $local_net dir fwd \
        tmpl src $peer_ip dst $local_ip $policy_tail
}

# ESP tunnel between the routers' transfer-net addresses, protecting
# traffic between the two endpoint networks.  The SPIs are mirrored
# between the two ends.
do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2

do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1

# The ipsec test runs without NAT.
ip netns exec $nsr1 nft delete table ip nat

# restore default routes
ip -net $ns2 route del 192.168.10.1 via 10.0.2.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns2 route add default via dead:2::1

if test_tcp_forwarding $ns1 $ns2; then
	check_counters "ipsec tunnel mode for ns1/ns2"
else
	# Report on stderr and record the failure like the other tests do;
	# previously this FAIL did not affect the exit status.
	echo "FAIL: ipsec tunnel mode for ns1/ns2" 1>&2
	ip netns exec $nsr1 nft list ruleset 1>&2
	ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2
	ret=1
fi

exit $ret